From 42f9cc64eb5e7c7edd42962f18814c0713237e41 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Wed, 13 May 2026 12:50:24 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: Neelectric/Llama-3.1-8B-Instruct_SFT_mathfisher_v00.03 Source: Original Platform --- .gitattributes | 36 + README.md | 60 + all_results.json | 9 + chat_template.jinja | 121 + config.json | 36 + generation_config.json | 9 + model-00001-of-00004.safetensors | 3 + model-00002-of-00004.safetensors | 3 + model-00003-of-00004.safetensors | 3 + model-00004-of-00004.safetensors | 3 + model.safetensors.index.json | 299 + special_tokens_map.json | 11 + tokenizer.json | 3 + tokenizer_config.json | 2063 + train_results.json | 9 + trainer_state.json | 235874 ++++++++++++++++++++++++++++ training_args.bin | 3 + 17 files changed, 238545 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model-00001-of-00004.safetensors create mode 100644 model-00002-of-00004.safetensors create mode 100644 model-00003-of-00004.safetensors create mode 100644 model-00004-of-00004.safetensors create mode 100644 model.safetensors.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..337ed59 --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +datasets: Neelectric/OpenR1-Math-220k_all_Llama3_4096toks +library_name: transformers +model_name: Llama-3.1-8B-Instruct_SFT_mathfisher_v00.03 +tags: +- generated_from_trainer +- sft +- open-r1 +- trl +licence: license +--- + +# Model Card for Llama-3.1-8B-Instruct_SFT_mathfisher_v00.03 + +This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the [Neelectric/OpenR1-Math-220k_all_Llama3_4096toks](https://huggingface.co/datasets/Neelectric/OpenR1-Math-220k_all_Llama3_4096toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.1-8B-Instruct_SFT_mathfisher_v00.03", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_math/runs/56tyvnza) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 1.0.0.dev0 +- Transformers: 4.57.6 +- Pytorch: 2.9.0 +- Datasets: 4.8.4 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..403473b --- /dev/null +++ b/all_results.json @@ -0,0 +1,9 @@ +{ + "ewc_loss": 8.285045623779297e-06, + "total_flos": 4.051147348618982e+19, + "train_loss": 0.44752783132196916, + "train_runtime": 39436.9749, + "train_samples": 125770, + "train_samples_per_second": 9.567, + "train_steps_per_second": 0.598 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: +... + + +... +" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..06df27b --- /dev/null +++ b/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..50f6077 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..755a4a2 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:134e8459ca64b1ae6f6630fd8ae98f2c808d7952ab1d5b57fb46c0ca2baa93dd +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..8d1d769 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3df6e147d004e8fbd02268e9d29d9bc28cdb93156697a667389a59ccb17869f +size 4999802720 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..2e1c5ce --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65f3f4fb3ed00c2dda3ae5886782497bfdd98c427a89d8e9259770615d63fb70 +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..b146185 --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fbbda3b8160f00fb8eaf7f6ff7ab6450604250770778a40660747ed1d9676a1 +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..9d4773c --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,11 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..3beeacc --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..403473b --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "ewc_loss": 8.285045623779297e-06, + "total_flos": 4.051147348618982e+19, + "train_loss": 0.44752783132196916, + "train_runtime": 39436.9749, + "train_samples": 125770, + "train_samples_per_second": 9.567, + "train_steps_per_second": 0.598 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..4bda62a --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,235874 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 23583, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00012721027859051011, + "ewc_loss": 0.0, + "grad_norm": 4.835970401763916, + "learning_rate": 0.0, + "loss": 0.7982, + "mean_token_accuracy": 0.7762961387634277, + "num_tokens": 38493.0, + "step": 1 + }, + { + "epoch": 0.00025442055718102023, + "ewc_loss": 0.0, + "grad_norm": 4.588061809539795, + "learning_rate": 4.2390843577787196e-10, + "loss": 0.8329, + "mean_token_accuracy": 0.765798807144165, + "num_tokens": 80419.0, + "step": 2 + }, + { + "epoch": 0.0003816308357715303, + "ewc_loss": 2.930733997499879e-19, + "grad_norm": 4.726269245147705, + "learning_rate": 8.478168715557439e-10, + "loss": 0.7225, + "mean_token_accuracy": 0.7960126996040344, + "num_tokens": 118717.0, + "step": 3 + }, + { + "epoch": 0.0005088411143620405, + "ewc_loss": 2.3635607360183997e-17, + "grad_norm": 5.365743160247803, + "learning_rate": 1.271725307333616e-09, + "loss": 0.8139, + "mean_token_accuracy": 0.7711055278778076, + "num_tokens": 150155.0, + "step": 4 + }, + { + "epoch": 0.0006360513929525506, + "ewc_loss": 6.245004513516506e-16, + "grad_norm": 4.301177978515625, + "learning_rate": 1.6956337431114878e-09, + "loss": 0.7919, + "mean_token_accuracy": 0.7747106552124023, + "num_tokens": 193616.0, + "step": 5 + }, + { + "epoch": 0.0007632616715430606, + "ewc_loss": 2.0122792321330962e-15, + "grad_norm": 5.215522289276123, + "learning_rate": 2.1195421788893596e-09, + "loss": 0.7893, + "mean_token_accuracy": 0.7785683870315552, + "num_tokens": 227640.0, + "step": 6 + }, + { + "epoch": 0.0008904719501335708, + "ewc_loss": 4.107825191113079e-15, + "grad_norm": 4.832206726074219, + "learning_rate": 2.543450614667232e-09, + "loss": 0.8161, + "mean_token_accuracy": 0.7746812105178833, + "num_tokens": 265114.0, + "step": 7 + }, + { + "epoch": 0.001017682228724081, + "ewc_loss": 1.532107773982716e-14, + "grad_norm": 4.93550443649292, + "learning_rate": 2.967359050445104e-09, + "loss": 0.7582, + "mean_token_accuracy": 0.7883400917053223, + "num_tokens": 299865.0, + "step": 8 + }, + { + "epoch": 0.001144892507314591, + "ewc_loss": 2.220446049250313e-14, + "grad_norm": 4.560969352722168, + "learning_rate": 3.3912674862229757e-09, + "loss": 0.814, + "mean_token_accuracy": 0.7736219167709351, + "num_tokens": 342063.0, + "step": 9 + }, + { + "epoch": 0.0012721027859051012, + "ewc_loss": 5.1514348342607263e-14, + "grad_norm": 5.446834564208984, + "learning_rate": 3.815175922000847e-09, + "loss": 0.8641, + "mean_token_accuracy": 0.7630735635757446, + "num_tokens": 374864.0, + "step": 10 + }, + { + "epoch": 0.0013993130644956112, + "ewc_loss": 9.281464485866309e-14, + "grad_norm": 4.300546646118164, + "learning_rate": 4.239084357778719e-09, + "loss": 0.7765, + "mean_token_accuracy": 0.7749488949775696, + "num_tokens": 416605.0, + "step": 11 + }, + { + "epoch": 0.0015265233430861213, + "ewc_loss": 1.3677947663381929e-13, + "grad_norm": 5.489023208618164, + "learning_rate": 4.662992793556591e-09, + "loss": 0.8346, + "mean_token_accuracy": 0.7689482569694519, + "num_tokens": 448798.0, + "step": 12 + }, + { + "epoch": 0.0016537336216766315, + "ewc_loss": 1.5987211554602254e-13, + "grad_norm": 5.4578704833984375, + "learning_rate": 5.086901229334464e-09, + "loss": 0.8739, + "mean_token_accuracy": 0.7540760040283203, + "num_tokens": 480084.0, + "step": 13 + }, + { + "epoch": 0.0017809439002671415, + "ewc_loss": 5.186961971048731e-13, + "grad_norm": 4.392617225646973, + "learning_rate": 5.510809665112336e-09, + "loss": 0.8058, + "mean_token_accuracy": 0.7792627811431885, + "num_tokens": 524543.0, + "step": 14 + }, + { + "epoch": 0.0019081541788576518, + "ewc_loss": 7.425171588693047e-13, + "grad_norm": 4.7859320640563965, + "learning_rate": 5.934718100890208e-09, + "loss": 0.7517, + "mean_token_accuracy": 0.7884218692779541, + "num_tokens": 563314.0, + "step": 15 + }, + { + "epoch": 0.002035364457448162, + "ewc_loss": 8.064660050877137e-13, + "grad_norm": 5.071033000946045, + "learning_rate": 6.3586265366680796e-09, + "loss": 0.8426, + "mean_token_accuracy": 0.7630425691604614, + "num_tokens": 598421.0, + "step": 16 + }, + { + "epoch": 0.002162574736038672, + "ewc_loss": 9.663381206337363e-13, + "grad_norm": 5.0327982902526855, + "learning_rate": 6.782534972445951e-09, + "loss": 0.8319, + "mean_token_accuracy": 0.7624252438545227, + "num_tokens": 634690.0, + "step": 17 + }, + { + "epoch": 0.002289785014629182, + "ewc_loss": 1.2008172234345693e-12, + "grad_norm": 5.137806415557861, + "learning_rate": 7.206443408223823e-09, + "loss": 0.8691, + "mean_token_accuracy": 0.7595454454421997, + "num_tokens": 674653.0, + "step": 18 + }, + { + "epoch": 0.0024169952932196924, + "ewc_loss": 4.604316927725449e-12, + "grad_norm": 5.067828178405762, + "learning_rate": 7.630351844001695e-09, + "loss": 0.8004, + "mean_token_accuracy": 0.7775569558143616, + "num_tokens": 708238.0, + "step": 19 + }, + { + "epoch": 0.0025442055718102024, + "ewc_loss": 5.7127635955112055e-12, + "grad_norm": 4.621129512786865, + "learning_rate": 8.054260279779567e-09, + "loss": 0.7828, + "mean_token_accuracy": 0.7785258293151855, + "num_tokens": 749312.0, + "step": 20 + }, + { + "epoch": 0.0026714158504007124, + "ewc_loss": 6.366462912410498e-12, + "grad_norm": 5.429529190063477, + "learning_rate": 8.478168715557438e-09, + "loss": 0.8528, + "mean_token_accuracy": 0.7667032480239868, + "num_tokens": 783532.0, + "step": 21 + }, + { + "epoch": 0.0027986261289912225, + "ewc_loss": 6.73594513500575e-12, + "grad_norm": 5.08949613571167, + "learning_rate": 8.902077151335311e-09, + "loss": 0.8302, + "mean_token_accuracy": 0.7733227014541626, + "num_tokens": 817429.0, + "step": 22 + }, + { + "epoch": 0.0029258364075817325, + "ewc_loss": 7.844391802791506e-12, + "grad_norm": 4.8283796310424805, + "learning_rate": 9.325985587113182e-09, + "loss": 0.7637, + "mean_token_accuracy": 0.7882852554321289, + "num_tokens": 853964.0, + "step": 23 + }, + { + "epoch": 0.0030530466861722425, + "ewc_loss": 8.526512829121202e-12, + "grad_norm": 5.440274238586426, + "learning_rate": 9.749894022891054e-09, + "loss": 0.8275, + "mean_token_accuracy": 0.7672711610794067, + "num_tokens": 885070.0, + "step": 24 + }, + { + "epoch": 0.003180256964762753, + "ewc_loss": 9.15179043659009e-12, + "grad_norm": 4.823431015014648, + "learning_rate": 1.0173802458668929e-08, + "loss": 0.8799, + "mean_token_accuracy": 0.7513006925582886, + "num_tokens": 926893.0, + "step": 25 + }, + { + "epoch": 0.003307467243353263, + "ewc_loss": 3.0468072509393096e-11, + "grad_norm": 4.866101264953613, + "learning_rate": 1.05977108944468e-08, + "loss": 0.8161, + "mean_token_accuracy": 0.7679038047790527, + "num_tokens": 964773.0, + "step": 26 + }, + { + "epoch": 0.003434677521943773, + "ewc_loss": 3.956301952712238e-11, + "grad_norm": 4.738644123077393, + "learning_rate": 1.1021619330224672e-08, + "loss": 0.7692, + "mean_token_accuracy": 0.785671591758728, + "num_tokens": 1002725.0, + "step": 27 + }, + { + "epoch": 0.003561887800534283, + "ewc_loss": 4.388311936054379e-11, + "grad_norm": 4.964737892150879, + "learning_rate": 1.1445527766002543e-08, + "loss": 0.8734, + "mean_token_accuracy": 0.7579348087310791, + "num_tokens": 1040296.0, + "step": 28 + }, + { + "epoch": 0.003689098079124793, + "ewc_loss": 4.6838977141305804e-11, + "grad_norm": 4.437984943389893, + "learning_rate": 1.1869436201780416e-08, + "loss": 0.7658, + "mean_token_accuracy": 0.7855533957481384, + "num_tokens": 1081711.0, + "step": 29 + }, + { + "epoch": 0.0038163083577153036, + "ewc_loss": 4.9112713895738125e-11, + "grad_norm": 4.812241554260254, + "learning_rate": 1.2293344637558287e-08, + "loss": 0.8119, + "mean_token_accuracy": 0.7759097814559937, + "num_tokens": 1120556.0, + "step": 30 + }, + { + "epoch": 0.003943518636305814, + "ewc_loss": 5.32054400537163e-11, + "grad_norm": 4.7543840408325195, + "learning_rate": 1.2717253073336159e-08, + "loss": 0.7637, + "mean_token_accuracy": 0.7900336980819702, + "num_tokens": 1157723.0, + "step": 31 + }, + { + "epoch": 0.004070728914896324, + "ewc_loss": 5.820766091346741e-11, + "grad_norm": 4.66197395324707, + "learning_rate": 1.314116150911403e-08, + "loss": 0.8099, + "mean_token_accuracy": 0.7743831276893616, + "num_tokens": 1197879.0, + "step": 32 + }, + { + "epoch": 0.004197939193486834, + "ewc_loss": 6.184563972055912e-11, + "grad_norm": 4.754628658294678, + "learning_rate": 1.3565069944891903e-08, + "loss": 0.8106, + "mean_token_accuracy": 0.7751972675323486, + "num_tokens": 1237342.0, + "step": 33 + }, + { + "epoch": 0.004325149472077344, + "ewc_loss": 6.548361852765083e-11, + "grad_norm": 4.491781711578369, + "learning_rate": 1.3988978380669775e-08, + "loss": 0.7819, + "mean_token_accuracy": 0.7822901606559753, + "num_tokens": 1280197.0, + "step": 34 + }, + { + "epoch": 0.004452359750667854, + "ewc_loss": 7.958078640513122e-11, + "grad_norm": 4.625627517700195, + "learning_rate": 1.4412886816447646e-08, + "loss": 0.7642, + "mean_token_accuracy": 0.784464955329895, + "num_tokens": 1318625.0, + "step": 35 + }, + { + "epoch": 0.004579570029258364, + "ewc_loss": 2.255546860396862e-10, + "grad_norm": 4.751155853271484, + "learning_rate": 1.4836795252225519e-08, + "loss": 0.8217, + "mean_token_accuracy": 0.7719884514808655, + "num_tokens": 1356868.0, + "step": 36 + }, + { + "epoch": 0.004706780307848874, + "ewc_loss": 2.7830537874251604e-10, + "grad_norm": 4.799177169799805, + "learning_rate": 1.526070368800339e-08, + "loss": 0.7937, + "mean_token_accuracy": 0.7807905673980713, + "num_tokens": 1394696.0, + "step": 37 + }, + { + "epoch": 0.004833990586439385, + "ewc_loss": 3.1104718800634146e-10, + "grad_norm": 4.342911720275879, + "learning_rate": 1.5684612123781262e-08, + "loss": 0.7829, + "mean_token_accuracy": 0.7821173667907715, + "num_tokens": 1438738.0, + "step": 38 + }, + { + "epoch": 0.004961200865029895, + "ewc_loss": 3.3287506084889174e-10, + "grad_norm": 4.7009358406066895, + "learning_rate": 1.6108520559559135e-08, + "loss": 0.7482, + "mean_token_accuracy": 0.7921982407569885, + "num_tokens": 1475089.0, + "step": 39 + }, + { + "epoch": 0.005088411143620405, + "ewc_loss": 3.4924596548080444e-10, + "grad_norm": 4.691748142242432, + "learning_rate": 1.6532428995337004e-08, + "loss": 0.8082, + "mean_token_accuracy": 0.7744965553283691, + "num_tokens": 1514566.0, + "step": 40 + }, + { + "epoch": 0.005215621422210915, + "ewc_loss": 3.637978807091713e-10, + "grad_norm": 4.896240234375, + "learning_rate": 1.6956337431114877e-08, + "loss": 0.8231, + "mean_token_accuracy": 0.7735510468482971, + "num_tokens": 1552560.0, + "step": 41 + }, + { + "epoch": 0.005342831700801425, + "ewc_loss": 3.710738383233547e-10, + "grad_norm": 5.321018218994141, + "learning_rate": 1.738024586689275e-08, + "loss": 0.8004, + "mean_token_accuracy": 0.7773172855377197, + "num_tokens": 1584759.0, + "step": 42 + }, + { + "epoch": 0.005470041979391935, + "ewc_loss": 3.8744474295526743e-10, + "grad_norm": 4.812631607055664, + "learning_rate": 1.7804154302670622e-08, + "loss": 0.8023, + "mean_token_accuracy": 0.7759655714035034, + "num_tokens": 1621825.0, + "step": 43 + }, + { + "epoch": 0.005597252257982445, + "ewc_loss": 4.129105946049094e-10, + "grad_norm": 4.70981502532959, + "learning_rate": 1.8228062738448494e-08, + "loss": 0.8275, + "mean_token_accuracy": 0.7681852579116821, + "num_tokens": 1662946.0, + "step": 44 + }, + { + "epoch": 0.005724462536572955, + "ewc_loss": 4.3291947804391384e-10, + "grad_norm": 4.5986104011535645, + "learning_rate": 1.8651971174226364e-08, + "loss": 0.8155, + "mean_token_accuracy": 0.7731042504310608, + "num_tokens": 1699433.0, + "step": 45 + }, + { + "epoch": 0.005851672815163465, + "ewc_loss": 4.511093720793724e-10, + "grad_norm": 4.283040523529053, + "learning_rate": 1.9075879610004236e-08, + "loss": 0.7462, + "mean_token_accuracy": 0.7888737916946411, + "num_tokens": 1742812.0, + "step": 46 + }, + { + "epoch": 0.005978883093753975, + "ewc_loss": 4.5656634029001e-10, + "grad_norm": 4.810732841491699, + "learning_rate": 1.949978804578211e-08, + "loss": 0.8019, + "mean_token_accuracy": 0.7786438465118408, + "num_tokens": 1778725.0, + "step": 47 + }, + { + "epoch": 0.006106093372344485, + "ewc_loss": 4.656612873077393e-10, + "grad_norm": 4.759642601013184, + "learning_rate": 1.9923696481559985e-08, + "loss": 0.848, + "mean_token_accuracy": 0.7613486051559448, + "num_tokens": 1816592.0, + "step": 48 + }, + { + "epoch": 0.006233303650934996, + "ewc_loss": 6.984919309616089e-10, + "grad_norm": 4.313797950744629, + "learning_rate": 2.0347604917337857e-08, + "loss": 0.7397, + "mean_token_accuracy": 0.7899084091186523, + "num_tokens": 1859907.0, + "step": 49 + }, + { + "epoch": 0.006360513929525506, + "ewc_loss": 1.382431946694851e-09, + "grad_norm": 4.558185577392578, + "learning_rate": 2.0771513353115727e-08, + "loss": 0.7289, + "mean_token_accuracy": 0.7995334267616272, + "num_tokens": 1896627.0, + "step": 50 + }, + { + "epoch": 0.006487724208116016, + "ewc_loss": 1.82626536116004e-09, + "grad_norm": 4.962312698364258, + "learning_rate": 2.11954217888936e-08, + "loss": 0.8443, + "mean_token_accuracy": 0.7671120762825012, + "num_tokens": 1934041.0, + "step": 51 + }, + { + "epoch": 0.006614934486706526, + "ewc_loss": 1.949956640601158e-09, + "grad_norm": 4.439557075500488, + "learning_rate": 2.1619330224671472e-08, + "loss": 0.7736, + "mean_token_accuracy": 0.784757137298584, + "num_tokens": 1976482.0, + "step": 52 + }, + { + "epoch": 0.006742144765297036, + "ewc_loss": 2.08092387765646e-09, + "grad_norm": 5.157295227050781, + "learning_rate": 2.2043238660449344e-08, + "loss": 0.8029, + "mean_token_accuracy": 0.7720325589179993, + "num_tokens": 2009224.0, + "step": 53 + }, + { + "epoch": 0.006869355043887546, + "ewc_loss": 2.1391315385699272e-09, + "grad_norm": 4.772197723388672, + "learning_rate": 2.2467147096227214e-08, + "loss": 0.8472, + "mean_token_accuracy": 0.7618905305862427, + "num_tokens": 2049235.0, + "step": 54 + }, + { + "epoch": 0.006996565322478056, + "ewc_loss": 2.2264430299401283e-09, + "grad_norm": 4.450559139251709, + "learning_rate": 2.2891055532005086e-08, + "loss": 0.8604, + "mean_token_accuracy": 0.7614541053771973, + "num_tokens": 2090260.0, + "step": 55 + }, + { + "epoch": 0.007123775601068566, + "ewc_loss": 2.2846506908535957e-09, + "grad_norm": 4.869048118591309, + "learning_rate": 2.331496396778296e-08, + "loss": 0.8658, + "mean_token_accuracy": 0.7603353261947632, + "num_tokens": 2126686.0, + "step": 56 + }, + { + "epoch": 0.007250985879659076, + "ewc_loss": 2.2992026060819626e-09, + "grad_norm": 4.2430195808410645, + "learning_rate": 2.373887240356083e-08, + "loss": 0.7678, + "mean_token_accuracy": 0.7848402261734009, + "num_tokens": 2171355.0, + "step": 57 + }, + { + "epoch": 0.007378196158249586, + "ewc_loss": 2.2846506908535957e-09, + "grad_norm": 4.458797454833984, + "learning_rate": 2.4162780839338704e-08, + "loss": 0.7407, + "mean_token_accuracy": 0.7928599119186401, + "num_tokens": 2211660.0, + "step": 58 + }, + { + "epoch": 0.007505406436840096, + "ewc_loss": 2.2992026060819626e-09, + "grad_norm": 5.383023262023926, + "learning_rate": 2.4586689275116573e-08, + "loss": 0.8707, + "mean_token_accuracy": 0.7594878673553467, + "num_tokens": 2244411.0, + "step": 59 + }, + { + "epoch": 0.007632616715430607, + "ewc_loss": 2.342858351767063e-09, + "grad_norm": 4.1984477043151855, + "learning_rate": 2.5010597710894446e-08, + "loss": 0.7977, + "mean_token_accuracy": 0.770315945148468, + "num_tokens": 2285253.0, + "step": 60 + }, + { + "epoch": 0.007759826994021117, + "ewc_loss": 2.3865140974521637e-09, + "grad_norm": 4.352649211883545, + "learning_rate": 2.5434506146672318e-08, + "loss": 0.7621, + "mean_token_accuracy": 0.7850627899169922, + "num_tokens": 2328577.0, + "step": 61 + }, + { + "epoch": 0.007887037272611627, + "ewc_loss": 2.473825588822365e-09, + "grad_norm": 4.532900333404541, + "learning_rate": 2.585841458245019e-08, + "loss": 0.8419, + "mean_token_accuracy": 0.7696154117584229, + "num_tokens": 2366524.0, + "step": 62 + }, + { + "epoch": 0.008014247551202136, + "ewc_loss": 2.5174813345074654e-09, + "grad_norm": 4.067293167114258, + "learning_rate": 2.628232301822806e-08, + "loss": 0.7557, + "mean_token_accuracy": 0.7896590232849121, + "num_tokens": 2408628.0, + "step": 63 + }, + { + "epoch": 0.008141457829792647, + "ewc_loss": 2.5756889954209328e-09, + "grad_norm": 4.138586044311523, + "learning_rate": 2.6706231454005933e-08, + "loss": 0.7949, + "mean_token_accuracy": 0.7757278680801392, + "num_tokens": 2451800.0, + "step": 64 + }, + { + "epoch": 0.008268668108383158, + "ewc_loss": 2.6775524020195007e-09, + "grad_norm": 5.5008320808410645, + "learning_rate": 2.7130139889783805e-08, + "loss": 0.866, + "mean_token_accuracy": 0.7548337578773499, + "num_tokens": 2481448.0, + "step": 65 + }, + { + "epoch": 0.008395878386973667, + "ewc_loss": 2.6921043172478676e-09, + "grad_norm": 4.136028289794922, + "learning_rate": 2.7554048325561678e-08, + "loss": 0.7929, + "mean_token_accuracy": 0.7753580808639526, + "num_tokens": 2526339.0, + "step": 66 + }, + { + "epoch": 0.008523088665564178, + "ewc_loss": 2.7066562324762344e-09, + "grad_norm": 4.286583423614502, + "learning_rate": 2.797795676133955e-08, + "loss": 0.8179, + "mean_token_accuracy": 0.7700444459915161, + "num_tokens": 2570691.0, + "step": 67 + }, + { + "epoch": 0.008650298944154687, + "ewc_loss": 2.9831426218152046e-09, + "grad_norm": 4.401442050933838, + "learning_rate": 2.840186519711742e-08, + "loss": 0.8044, + "mean_token_accuracy": 0.7750793695449829, + "num_tokens": 2609207.0, + "step": 68 + }, + { + "epoch": 0.008777509222745198, + "ewc_loss": 5.326000973582268e-09, + "grad_norm": 4.703990936279297, + "learning_rate": 2.8825773632895292e-08, + "loss": 0.7925, + "mean_token_accuracy": 0.7772303819656372, + "num_tokens": 2645494.0, + "step": 69 + }, + { + "epoch": 0.008904719501335707, + "ewc_loss": 7.8580342233181e-09, + "grad_norm": 4.2137064933776855, + "learning_rate": 2.9249682068673165e-08, + "loss": 0.7496, + "mean_token_accuracy": 0.7919385433197021, + "num_tokens": 2686258.0, + "step": 70 + }, + { + "epoch": 0.009031929779926218, + "ewc_loss": 9.487848728895187e-09, + "grad_norm": 4.7905402183532715, + "learning_rate": 2.9673590504451037e-08, + "loss": 0.8043, + "mean_token_accuracy": 0.7778860330581665, + "num_tokens": 2720544.0, + "step": 71 + }, + { + "epoch": 0.009159140058516728, + "ewc_loss": 1.0128132998943329e-08, + "grad_norm": 4.611462116241455, + "learning_rate": 3.0097498940228907e-08, + "loss": 0.7674, + "mean_token_accuracy": 0.7861135601997375, + "num_tokens": 2758068.0, + "step": 72 + }, + { + "epoch": 0.009286350337107238, + "ewc_loss": 1.0710209608078003e-08, + "grad_norm": 4.8017802238464355, + "learning_rate": 3.052140737600678e-08, + "loss": 0.8514, + "mean_token_accuracy": 0.763062596321106, + "num_tokens": 2793342.0, + "step": 73 + }, + { + "epoch": 0.009413560615697748, + "ewc_loss": 1.0884832590818405e-08, + "grad_norm": 4.929084777832031, + "learning_rate": 3.094531581178465e-08, + "loss": 0.8884, + "mean_token_accuracy": 0.7550736665725708, + "num_tokens": 2828003.0, + "step": 74 + }, + { + "epoch": 0.009540770894288259, + "ewc_loss": 1.1117663234472275e-08, + "grad_norm": 4.040344715118408, + "learning_rate": 3.1369224247562524e-08, + "loss": 0.7637, + "mean_token_accuracy": 0.7841902375221252, + "num_tokens": 2874755.0, + "step": 75 + }, + { + "epoch": 0.00966798117287877, + "ewc_loss": 1.1292286217212677e-08, + "grad_norm": 4.402063369750977, + "learning_rate": 3.17931326833404e-08, + "loss": 0.7859, + "mean_token_accuracy": 0.7762067914009094, + "num_tokens": 2909703.0, + "step": 76 + }, + { + "epoch": 0.009795191451469279, + "ewc_loss": 1.1175870895385742e-08, + "grad_norm": 3.9020166397094727, + "learning_rate": 3.221704111911827e-08, + "loss": 0.7161, + "mean_token_accuracy": 0.7987029552459717, + "num_tokens": 2954020.0, + "step": 77 + }, + { + "epoch": 0.00992240173005979, + "ewc_loss": 1.1117663234472275e-08, + "grad_norm": 4.954975128173828, + "learning_rate": 3.264094955489614e-08, + "loss": 0.8114, + "mean_token_accuracy": 0.7750698328018188, + "num_tokens": 2984894.0, + "step": 78 + }, + { + "epoch": 0.010049612008650299, + "ewc_loss": 1.0884832590818405e-08, + "grad_norm": 4.637815475463867, + "learning_rate": 3.306485799067401e-08, + "loss": 0.7182, + "mean_token_accuracy": 0.7952472567558289, + "num_tokens": 3017773.0, + "step": 79 + }, + { + "epoch": 0.01017682228724081, + "ewc_loss": 1.05355866253376e-08, + "grad_norm": 3.978937864303589, + "learning_rate": 3.348876642645188e-08, + "loss": 0.7177, + "mean_token_accuracy": 0.7977811694145203, + "num_tokens": 3059739.0, + "step": 80 + }, + { + "epoch": 0.010304032565831319, + "ewc_loss": 1.0826624929904938e-08, + "grad_norm": 4.5625152587890625, + "learning_rate": 3.391267486222975e-08, + "loss": 0.8098, + "mean_token_accuracy": 0.7723212838172913, + "num_tokens": 3094910.0, + "step": 81 + }, + { + "epoch": 0.01043124284442183, + "ewc_loss": 1.0710209608078003e-08, + "grad_norm": 4.270747184753418, + "learning_rate": 3.4336583298007626e-08, + "loss": 0.8134, + "mean_token_accuracy": 0.7736762166023254, + "num_tokens": 3133884.0, + "step": 82 + }, + { + "epoch": 0.010558453123012339, + "ewc_loss": 1.0826624929904938e-08, + "grad_norm": 4.8308868408203125, + "learning_rate": 3.47604917337855e-08, + "loss": 0.8462, + "mean_token_accuracy": 0.7603965997695923, + "num_tokens": 3166755.0, + "step": 83 + }, + { + "epoch": 0.01068566340160285, + "ewc_loss": 1.076841726899147e-08, + "grad_norm": 4.406330108642578, + "learning_rate": 3.518440016956337e-08, + "loss": 0.815, + "mean_token_accuracy": 0.77112877368927, + "num_tokens": 3204825.0, + "step": 84 + }, + { + "epoch": 0.010812873680193359, + "ewc_loss": 1.0826624929904938e-08, + "grad_norm": 4.395390510559082, + "learning_rate": 3.5608308605341244e-08, + "loss": 0.8504, + "mean_token_accuracy": 0.7626650333404541, + "num_tokens": 3244036.0, + "step": 85 + }, + { + "epoch": 0.01094008395878387, + "ewc_loss": 1.100124791264534e-08, + "grad_norm": 4.21000337600708, + "learning_rate": 3.6032217041119116e-08, + "loss": 0.8378, + "mean_token_accuracy": 0.7670802474021912, + "num_tokens": 3285969.0, + "step": 86 + }, + { + "epoch": 0.01106729423737438, + "ewc_loss": 1.1117663234472275e-08, + "grad_norm": 4.149404048919678, + "learning_rate": 3.645612547689699e-08, + "loss": 0.8088, + "mean_token_accuracy": 0.7751160860061646, + "num_tokens": 3327648.0, + "step": 87 + }, + { + "epoch": 0.01119450451596489, + "ewc_loss": 1.1350493878126144e-08, + "grad_norm": 4.315306186676025, + "learning_rate": 3.6880033912674855e-08, + "loss": 0.812, + "mean_token_accuracy": 0.7731003761291504, + "num_tokens": 3367399.0, + "step": 88 + }, + { + "epoch": 0.0113217147945554, + "ewc_loss": 1.1816155165433884e-08, + "grad_norm": 4.500613689422607, + "learning_rate": 3.730394234845273e-08, + "loss": 0.7952, + "mean_token_accuracy": 0.7797884941101074, + "num_tokens": 3405402.0, + "step": 89 + }, + { + "epoch": 0.01144892507314591, + "ewc_loss": 1.2165401130914688e-08, + "grad_norm": 4.610160827636719, + "learning_rate": 3.77278507842306e-08, + "loss": 0.8787, + "mean_token_accuracy": 0.7541475296020508, + "num_tokens": 3441791.0, + "step": 90 + }, + { + "epoch": 0.01157613535173642, + "ewc_loss": 1.2689270079135895e-08, + "grad_norm": 4.566935062408447, + "learning_rate": 3.815175922000847e-08, + "loss": 0.838, + "mean_token_accuracy": 0.7690249085426331, + "num_tokens": 3480151.0, + "step": 91 + }, + { + "epoch": 0.01170334563032693, + "ewc_loss": 1.2747477740049362e-08, + "grad_norm": 4.42471170425415, + "learning_rate": 3.8575667655786345e-08, + "loss": 0.8017, + "mean_token_accuracy": 0.7758381366729736, + "num_tokens": 3516867.0, + "step": 92 + }, + { + "epoch": 0.01183055590891744, + "ewc_loss": 1.30385160446167e-08, + "grad_norm": 4.028701305389404, + "learning_rate": 3.899957609156422e-08, + "loss": 0.7777, + "mean_token_accuracy": 0.7830331921577454, + "num_tokens": 3557466.0, + "step": 93 + }, + { + "epoch": 0.01195776618750795, + "ewc_loss": 1.3620592653751373e-08, + "grad_norm": 4.22844123840332, + "learning_rate": 3.94234845273421e-08, + "loss": 0.8071, + "mean_token_accuracy": 0.7735814452171326, + "num_tokens": 3596009.0, + "step": 94 + }, + { + "epoch": 0.012084976466098461, + "ewc_loss": 1.4901161193847656e-08, + "grad_norm": 4.394076347351074, + "learning_rate": 3.984739296311997e-08, + "loss": 0.7991, + "mean_token_accuracy": 0.7747915387153625, + "num_tokens": 3631582.0, + "step": 95 + }, + { + "epoch": 0.01221218674468897, + "ewc_loss": 1.885928213596344e-08, + "grad_norm": 4.424163818359375, + "learning_rate": 4.027130139889784e-08, + "loss": 0.8252, + "mean_token_accuracy": 0.7680615186691284, + "num_tokens": 3672855.0, + "step": 96 + }, + { + "epoch": 0.012339397023279481, + "ewc_loss": 2.3865140974521637e-08, + "grad_norm": 4.182048797607422, + "learning_rate": 4.0695209834675715e-08, + "loss": 0.8164, + "mean_token_accuracy": 0.7731541395187378, + "num_tokens": 3714100.0, + "step": 97 + }, + { + "epoch": 0.012466607301869992, + "ewc_loss": 2.9453076422214508e-08, + "grad_norm": 4.361045837402344, + "learning_rate": 4.111911827045358e-08, + "loss": 0.7668, + "mean_token_accuracy": 0.7853004932403564, + "num_tokens": 3752834.0, + "step": 98 + }, + { + "epoch": 0.012593817580460501, + "ewc_loss": 3.189779818058014e-08, + "grad_norm": 4.395951747894287, + "learning_rate": 4.154302670623145e-08, + "loss": 0.7776, + "mean_token_accuracy": 0.7812743186950684, + "num_tokens": 3791513.0, + "step": 99 + }, + { + "epoch": 0.012721027859051012, + "ewc_loss": 3.259629011154175e-08, + "grad_norm": 4.698955535888672, + "learning_rate": 4.1966935142009326e-08, + "loss": 0.8003, + "mean_token_accuracy": 0.7753592729568481, + "num_tokens": 3825085.0, + "step": 100 + }, + { + "epoch": 0.012848238137641521, + "ewc_loss": 3.329478204250336e-08, + "grad_norm": 4.120013236999512, + "learning_rate": 4.23908435777872e-08, + "loss": 0.8011, + "mean_token_accuracy": 0.7765176296234131, + "num_tokens": 3866492.0, + "step": 101 + }, + { + "epoch": 0.012975448416232032, + "ewc_loss": 3.3527612686157227e-08, + "grad_norm": 4.437995910644531, + "learning_rate": 4.281475201356507e-08, + "loss": 0.8283, + "mean_token_accuracy": 0.7634662985801697, + "num_tokens": 3904053.0, + "step": 102 + }, + { + "epoch": 0.013102658694822541, + "ewc_loss": 3.4226104617118835e-08, + "grad_norm": 4.403983116149902, + "learning_rate": 4.3238660449342943e-08, + "loss": 0.7365, + "mean_token_accuracy": 0.7954703569412231, + "num_tokens": 3935118.0, + "step": 103 + }, + { + "epoch": 0.013229868973413052, + "ewc_loss": 3.4226104617118835e-08, + "grad_norm": 4.032034397125244, + "learning_rate": 4.3662568885120816e-08, + "loss": 0.7868, + "mean_token_accuracy": 0.7764103412628174, + "num_tokens": 3977899.0, + "step": 104 + }, + { + "epoch": 0.013357079252003561, + "ewc_loss": 3.3993273973464966e-08, + "grad_norm": 3.9524037837982178, + "learning_rate": 4.408647732089869e-08, + "loss": 0.8074, + "mean_token_accuracy": 0.7735268473625183, + "num_tokens": 4019416.0, + "step": 105 + }, + { + "epoch": 0.013484289530594072, + "ewc_loss": 3.3527612686157227e-08, + "grad_norm": 3.909682512283325, + "learning_rate": 4.451038575667656e-08, + "loss": 0.7962, + "mean_token_accuracy": 0.7732097506523132, + "num_tokens": 4063887.0, + "step": 106 + }, + { + "epoch": 0.013611499809184581, + "ewc_loss": 3.3760443329811096e-08, + "grad_norm": 4.034852027893066, + "learning_rate": 4.493429419245443e-08, + "loss": 0.8274, + "mean_token_accuracy": 0.7679883241653442, + "num_tokens": 4101309.0, + "step": 107 + }, + { + "epoch": 0.013738710087775092, + "ewc_loss": 3.3760443329811096e-08, + "grad_norm": 4.221742630004883, + "learning_rate": 4.53582026282323e-08, + "loss": 0.8118, + "mean_token_accuracy": 0.7702419757843018, + "num_tokens": 4136669.0, + "step": 108 + }, + { + "epoch": 0.013865920366365603, + "ewc_loss": 3.3993273973464966e-08, + "grad_norm": 3.6891930103302, + "learning_rate": 4.578211106401017e-08, + "loss": 0.7428, + "mean_token_accuracy": 0.7905797958374023, + "num_tokens": 4178583.0, + "step": 109 + }, + { + "epoch": 0.013993130644956112, + "ewc_loss": 3.3993273973464966e-08, + "grad_norm": 3.751206159591675, + "learning_rate": 4.6206019499788045e-08, + "loss": 0.7733, + "mean_token_accuracy": 0.7774076461791992, + "num_tokens": 4219552.0, + "step": 110 + }, + { + "epoch": 0.014120340923546623, + "ewc_loss": 3.4226104617118835e-08, + "grad_norm": 3.819380521774292, + "learning_rate": 4.662992793556592e-08, + "loss": 0.766, + "mean_token_accuracy": 0.7851861715316772, + "num_tokens": 4264487.0, + "step": 111 + }, + { + "epoch": 0.014247551202137132, + "ewc_loss": 3.3527612686157227e-08, + "grad_norm": 3.8392200469970703, + "learning_rate": 4.705383637134379e-08, + "loss": 0.7321, + "mean_token_accuracy": 0.7903293967247009, + "num_tokens": 4303015.0, + "step": 112 + }, + { + "epoch": 0.014374761480727643, + "ewc_loss": 3.3760443329811096e-08, + "grad_norm": 3.779449462890625, + "learning_rate": 4.747774480712166e-08, + "loss": 0.787, + "mean_token_accuracy": 0.7771527767181396, + "num_tokens": 4345446.0, + "step": 113 + }, + { + "epoch": 0.014501971759318152, + "ewc_loss": 3.3527612686157227e-08, + "grad_norm": 3.878913402557373, + "learning_rate": 4.7901653242899535e-08, + "loss": 0.7392, + "mean_token_accuracy": 0.790928065776825, + "num_tokens": 4383854.0, + "step": 114 + }, + { + "epoch": 0.014629182037908663, + "ewc_loss": 3.4226104617118835e-08, + "grad_norm": 3.968348264694214, + "learning_rate": 4.832556167867741e-08, + "loss": 0.8083, + "mean_token_accuracy": 0.773392379283905, + "num_tokens": 4420750.0, + "step": 115 + }, + { + "epoch": 0.014756392316499172, + "ewc_loss": 3.5157427191734314e-08, + "grad_norm": 4.135779857635498, + "learning_rate": 4.8749470114455274e-08, + "loss": 0.8052, + "mean_token_accuracy": 0.7735240459442139, + "num_tokens": 4458761.0, + "step": 116 + }, + { + "epoch": 0.014883602595089683, + "ewc_loss": 3.67872416973114e-08, + "grad_norm": 3.9840171337127686, + "learning_rate": 4.9173378550233146e-08, + "loss": 0.7652, + "mean_token_accuracy": 0.7812439203262329, + "num_tokens": 4496547.0, + "step": 117 + }, + { + "epoch": 0.015010812873680193, + "ewc_loss": 3.725290298461914e-08, + "grad_norm": 3.999772787094116, + "learning_rate": 4.959728698601102e-08, + "loss": 0.7632, + "mean_token_accuracy": 0.7896963357925415, + "num_tokens": 4533357.0, + "step": 118 + }, + { + "epoch": 0.015138023152270703, + "ewc_loss": 3.864988684654236e-08, + "grad_norm": 3.888915777206421, + "learning_rate": 5.002119542178889e-08, + "loss": 0.7377, + "mean_token_accuracy": 0.7911967039108276, + "num_tokens": 4573570.0, + "step": 119 + }, + { + "epoch": 0.015265233430861214, + "ewc_loss": 3.864988684654236e-08, + "grad_norm": 4.0846476554870605, + "learning_rate": 5.0445103857566764e-08, + "loss": 0.7929, + "mean_token_accuracy": 0.7754673957824707, + "num_tokens": 4613195.0, + "step": 120 + }, + { + "epoch": 0.015392443709451724, + "ewc_loss": 3.841705620288849e-08, + "grad_norm": 3.6409356594085693, + "learning_rate": 5.0869012293344637e-08, + "loss": 0.713, + "mean_token_accuracy": 0.7973262071609497, + "num_tokens": 4657230.0, + "step": 121 + }, + { + "epoch": 0.015519653988042234, + "ewc_loss": 3.91155481338501e-08, + "grad_norm": 3.774409532546997, + "learning_rate": 5.129292072912251e-08, + "loss": 0.7869, + "mean_token_accuracy": 0.7806633710861206, + "num_tokens": 4701436.0, + "step": 122 + }, + { + "epoch": 0.015646864266632744, + "ewc_loss": 4.0512531995773315e-08, + "grad_norm": 3.8640992641448975, + "learning_rate": 5.171682916490038e-08, + "loss": 0.7748, + "mean_token_accuracy": 0.7802762985229492, + "num_tokens": 4741806.0, + "step": 123 + }, + { + "epoch": 0.015774074545223254, + "ewc_loss": 4.1443854570388794e-08, + "grad_norm": 4.061357021331787, + "learning_rate": 5.2140737600678254e-08, + "loss": 0.8119, + "mean_token_accuracy": 0.765822172164917, + "num_tokens": 4780671.0, + "step": 124 + }, + { + "epoch": 0.015901284823813765, + "ewc_loss": 4.190951585769653e-08, + "grad_norm": 4.0038533210754395, + "learning_rate": 5.256464603645612e-08, + "loss": 0.7605, + "mean_token_accuracy": 0.7862980365753174, + "num_tokens": 4816879.0, + "step": 125 + }, + { + "epoch": 0.016028495102404273, + "ewc_loss": 4.493631422519684e-08, + "grad_norm": 4.030223846435547, + "learning_rate": 5.298855447223399e-08, + "loss": 0.813, + "mean_token_accuracy": 0.7674217820167542, + "num_tokens": 4858704.0, + "step": 126 + }, + { + "epoch": 0.016155705380994784, + "ewc_loss": 4.7497451305389404e-08, + "grad_norm": 4.068124771118164, + "learning_rate": 5.3412462908011865e-08, + "loss": 0.7696, + "mean_token_accuracy": 0.7822837829589844, + "num_tokens": 4896780.0, + "step": 127 + }, + { + "epoch": 0.016282915659585295, + "ewc_loss": 4.7963112592697144e-08, + "grad_norm": 4.172075271606445, + "learning_rate": 5.383637134378974e-08, + "loss": 0.7993, + "mean_token_accuracy": 0.7721927762031555, + "num_tokens": 4932108.0, + "step": 128 + }, + { + "epoch": 0.016410125938175806, + "ewc_loss": 5.075708031654358e-08, + "grad_norm": 3.858516216278076, + "learning_rate": 5.426027977956761e-08, + "loss": 0.7755, + "mean_token_accuracy": 0.7820805907249451, + "num_tokens": 4970384.0, + "step": 129 + }, + { + "epoch": 0.016537336216766316, + "ewc_loss": 5.564652383327484e-08, + "grad_norm": 4.203873634338379, + "learning_rate": 5.468418821534548e-08, + "loss": 0.7838, + "mean_token_accuracy": 0.777428388595581, + "num_tokens": 5005996.0, + "step": 130 + }, + { + "epoch": 0.016664546495356824, + "ewc_loss": 5.9138983488082886e-08, + "grad_norm": 4.066949844360352, + "learning_rate": 5.5108096651123356e-08, + "loss": 0.7629, + "mean_token_accuracy": 0.7813847064971924, + "num_tokens": 5044208.0, + "step": 131 + }, + { + "epoch": 0.016791756773947335, + "ewc_loss": 6.007030606269836e-08, + "grad_norm": 3.981755256652832, + "learning_rate": 5.553200508690123e-08, + "loss": 0.7775, + "mean_token_accuracy": 0.7759720683097839, + "num_tokens": 5085173.0, + "step": 132 + }, + { + "epoch": 0.016918967052537846, + "ewc_loss": 6.146728992462158e-08, + "grad_norm": 4.294731616973877, + "learning_rate": 5.59559135226791e-08, + "loss": 0.7921, + "mean_token_accuracy": 0.7740026116371155, + "num_tokens": 5118440.0, + "step": 133 + }, + { + "epoch": 0.017046177331128357, + "ewc_loss": 6.565824151039124e-08, + "grad_norm": 4.5646281242370605, + "learning_rate": 5.637982195845697e-08, + "loss": 0.7952, + "mean_token_accuracy": 0.7750462889671326, + "num_tokens": 5155202.0, + "step": 134 + }, + { + "epoch": 0.017173387609718864, + "ewc_loss": 7.310882210731506e-08, + "grad_norm": 4.349681377410889, + "learning_rate": 5.680373039423484e-08, + "loss": 0.7796, + "mean_token_accuracy": 0.7804962396621704, + "num_tokens": 5193240.0, + "step": 135 + }, + { + "epoch": 0.017300597888309375, + "ewc_loss": 7.776543498039246e-08, + "grad_norm": 3.999018430709839, + "learning_rate": 5.722763883001271e-08, + "loss": 0.7672, + "mean_token_accuracy": 0.7817463874816895, + "num_tokens": 5230975.0, + "step": 136 + }, + { + "epoch": 0.017427808166899886, + "ewc_loss": 8.381903171539307e-08, + "grad_norm": 4.325400352478027, + "learning_rate": 5.7651547265790585e-08, + "loss": 0.8278, + "mean_token_accuracy": 0.7650959491729736, + "num_tokens": 5264786.0, + "step": 137 + }, + { + "epoch": 0.017555018445490397, + "ewc_loss": 8.800998330116272e-08, + "grad_norm": 3.934995651245117, + "learning_rate": 5.807545570156846e-08, + "loss": 0.7902, + "mean_token_accuracy": 0.7752729654312134, + "num_tokens": 5301554.0, + "step": 138 + }, + { + "epoch": 0.017682228724080904, + "ewc_loss": 9.033828973770142e-08, + "grad_norm": 4.027656078338623, + "learning_rate": 5.849936413734633e-08, + "loss": 0.729, + "mean_token_accuracy": 0.7915083765983582, + "num_tokens": 5337356.0, + "step": 139 + }, + { + "epoch": 0.017809439002671415, + "ewc_loss": 9.452924132347107e-08, + "grad_norm": 3.99059796333313, + "learning_rate": 5.89232725731242e-08, + "loss": 0.8006, + "mean_token_accuracy": 0.7747078537940979, + "num_tokens": 5375767.0, + "step": 140 + }, + { + "epoch": 0.017936649281261926, + "ewc_loss": 9.592622518539429e-08, + "grad_norm": 3.9466848373413086, + "learning_rate": 5.9347181008902075e-08, + "loss": 0.7516, + "mean_token_accuracy": 0.7832291126251221, + "num_tokens": 5413683.0, + "step": 141 + }, + { + "epoch": 0.018063859559852437, + "ewc_loss": 9.918585419654846e-08, + "grad_norm": 3.4989428520202637, + "learning_rate": 5.977108944467995e-08, + "loss": 0.7335, + "mean_token_accuracy": 0.7888029217720032, + "num_tokens": 5457750.0, + "step": 142 + }, + { + "epoch": 0.018191069838442948, + "ewc_loss": 1.0058283805847168e-07, + "grad_norm": 3.621156930923462, + "learning_rate": 6.019499788045781e-08, + "loss": 0.7279, + "mean_token_accuracy": 0.790576696395874, + "num_tokens": 5495743.0, + "step": 143 + }, + { + "epoch": 0.018318280117033455, + "ewc_loss": 1.0151416063308716e-07, + "grad_norm": 3.6332390308380127, + "learning_rate": 6.061890631623569e-08, + "loss": 0.7411, + "mean_token_accuracy": 0.782193660736084, + "num_tokens": 5532198.0, + "step": 144 + }, + { + "epoch": 0.018445490395623966, + "ewc_loss": 1.0291114449501038e-07, + "grad_norm": 3.7275333404541016, + "learning_rate": 6.104281475201356e-08, + "loss": 0.7538, + "mean_token_accuracy": 0.7829538583755493, + "num_tokens": 5568977.0, + "step": 145 + }, + { + "epoch": 0.018572700674214477, + "ewc_loss": 1.0477378964424133e-07, + "grad_norm": 3.597214937210083, + "learning_rate": 6.146672318779143e-08, + "loss": 0.7664, + "mean_token_accuracy": 0.7750922441482544, + "num_tokens": 5606229.0, + "step": 146 + }, + { + "epoch": 0.018699910952804988, + "ewc_loss": 1.0570511221885681e-07, + "grad_norm": 3.3409957885742188, + "learning_rate": 6.18906316235693e-08, + "loss": 0.768, + "mean_token_accuracy": 0.777238667011261, + "num_tokens": 5649828.0, + "step": 147 + }, + { + "epoch": 0.018827121231395495, + "ewc_loss": 1.0710209608078003e-07, + "grad_norm": 3.3734796047210693, + "learning_rate": 6.231454005934718e-08, + "loss": 0.714, + "mean_token_accuracy": 0.7978212237358093, + "num_tokens": 5687433.0, + "step": 148 + }, + { + "epoch": 0.018954331509986006, + "ewc_loss": 1.0943040251731873e-07, + "grad_norm": 3.6857528686523438, + "learning_rate": 6.273844849512505e-08, + "loss": 0.7448, + "mean_token_accuracy": 0.7842670679092407, + "num_tokens": 5723247.0, + "step": 149 + }, + { + "epoch": 0.019081541788576517, + "ewc_loss": 1.0896474123001099e-07, + "grad_norm": 3.4156854152679443, + "learning_rate": 6.316235693090292e-08, + "loss": 0.7599, + "mean_token_accuracy": 0.7791236639022827, + "num_tokens": 5760305.0, + "step": 150 + }, + { + "epoch": 0.019208752067167028, + "ewc_loss": 1.0896474123001099e-07, + "grad_norm": 3.221707582473755, + "learning_rate": 6.35862653666808e-08, + "loss": 0.7607, + "mean_token_accuracy": 0.7783452868461609, + "num_tokens": 5800586.0, + "step": 151 + }, + { + "epoch": 0.01933596234575754, + "ewc_loss": 1.0943040251731873e-07, + "grad_norm": 3.371408224105835, + "learning_rate": 6.401017380245867e-08, + "loss": 0.7526, + "mean_token_accuracy": 0.779678463935852, + "num_tokens": 5840351.0, + "step": 152 + }, + { + "epoch": 0.019463172624348046, + "ewc_loss": 1.103617250919342e-07, + "grad_norm": 3.4021859169006348, + "learning_rate": 6.443408223823654e-08, + "loss": 0.7928, + "mean_token_accuracy": 0.770854115486145, + "num_tokens": 5880162.0, + "step": 153 + }, + { + "epoch": 0.019590382902938557, + "ewc_loss": 1.1082738637924194e-07, + "grad_norm": 3.4804129600524902, + "learning_rate": 6.485799067401441e-08, + "loss": 0.8483, + "mean_token_accuracy": 0.7564669847488403, + "num_tokens": 5922213.0, + "step": 154 + }, + { + "epoch": 0.019717593181529068, + "ewc_loss": 1.1362135410308838e-07, + "grad_norm": 3.4366488456726074, + "learning_rate": 6.528189910979228e-08, + "loss": 0.7298, + "mean_token_accuracy": 0.7860912084579468, + "num_tokens": 5957461.0, + "step": 155 + }, + { + "epoch": 0.01984480346011958, + "ewc_loss": 1.150183379650116e-07, + "grad_norm": 3.3049521446228027, + "learning_rate": 6.570580754557016e-08, + "loss": 0.7152, + "mean_token_accuracy": 0.7862256765365601, + "num_tokens": 5994675.0, + "step": 156 + }, + { + "epoch": 0.019972013738710086, + "ewc_loss": 1.1408701539039612e-07, + "grad_norm": 3.3121917247772217, + "learning_rate": 6.612971598134802e-08, + "loss": 0.6783, + "mean_token_accuracy": 0.8021008968353271, + "num_tokens": 6032514.0, + "step": 157 + }, + { + "epoch": 0.020099224017300597, + "ewc_loss": 1.150183379650116e-07, + "grad_norm": 3.3329403400421143, + "learning_rate": 6.655362441712589e-08, + "loss": 0.7506, + "mean_token_accuracy": 0.7805607914924622, + "num_tokens": 6069160.0, + "step": 158 + }, + { + "epoch": 0.020226434295891108, + "ewc_loss": 1.1688098311424255e-07, + "grad_norm": 3.40339994430542, + "learning_rate": 6.697753285290376e-08, + "loss": 0.8045, + "mean_token_accuracy": 0.7653694152832031, + "num_tokens": 6106652.0, + "step": 159 + }, + { + "epoch": 0.02035364457448162, + "ewc_loss": 1.1827796697616577e-07, + "grad_norm": 3.3108606338500977, + "learning_rate": 6.740144128868163e-08, + "loss": 0.7392, + "mean_token_accuracy": 0.7859421372413635, + "num_tokens": 6143070.0, + "step": 160 + }, + { + "epoch": 0.020480854853072127, + "ewc_loss": 1.1827796697616577e-07, + "grad_norm": 2.973607063293457, + "learning_rate": 6.78253497244595e-08, + "loss": 0.7316, + "mean_token_accuracy": 0.7874528169631958, + "num_tokens": 6186495.0, + "step": 161 + }, + { + "epoch": 0.020608065131662637, + "ewc_loss": 1.1920928955078125e-07, + "grad_norm": 3.3396127223968506, + "learning_rate": 6.824925816023738e-08, + "loss": 0.7316, + "mean_token_accuracy": 0.7864516377449036, + "num_tokens": 6222454.0, + "step": 162 + }, + { + "epoch": 0.02073527541025315, + "ewc_loss": 1.2014061212539673e-07, + "grad_norm": 3.6475467681884766, + "learning_rate": 6.867316659601525e-08, + "loss": 0.7435, + "mean_token_accuracy": 0.7860056161880493, + "num_tokens": 6253260.0, + "step": 163 + }, + { + "epoch": 0.02086248568884366, + "ewc_loss": 1.2014061212539673e-07, + "grad_norm": 3.060633897781372, + "learning_rate": 6.909707503179312e-08, + "loss": 0.7255, + "mean_token_accuracy": 0.7905220985412598, + "num_tokens": 6294773.0, + "step": 164 + }, + { + "epoch": 0.02098969596743417, + "ewc_loss": 1.2014061212539673e-07, + "grad_norm": 3.3071770668029785, + "learning_rate": 6.9520983467571e-08, + "loss": 0.7689, + "mean_token_accuracy": 0.7764692306518555, + "num_tokens": 6335334.0, + "step": 165 + }, + { + "epoch": 0.021116906246024678, + "ewc_loss": 1.2014061212539673e-07, + "grad_norm": 3.2298805713653564, + "learning_rate": 6.994489190334887e-08, + "loss": 0.7035, + "mean_token_accuracy": 0.7922539710998535, + "num_tokens": 6372255.0, + "step": 166 + }, + { + "epoch": 0.02124411652461519, + "ewc_loss": 1.2014061212539673e-07, + "grad_norm": 3.0346755981445312, + "learning_rate": 7.036880033912674e-08, + "loss": 0.6803, + "mean_token_accuracy": 0.8012290000915527, + "num_tokens": 6411971.0, + "step": 167 + }, + { + "epoch": 0.0213713268032057, + "ewc_loss": 1.2014061212539673e-07, + "grad_norm": 3.7197039127349854, + "learning_rate": 7.079270877490461e-08, + "loss": 0.7498, + "mean_token_accuracy": 0.7798041105270386, + "num_tokens": 6450366.0, + "step": 168 + }, + { + "epoch": 0.02149853708179621, + "ewc_loss": 1.2293457984924316e-07, + "grad_norm": 3.5636305809020996, + "learning_rate": 7.121661721068249e-08, + "loss": 0.7128, + "mean_token_accuracy": 0.7893389463424683, + "num_tokens": 6482199.0, + "step": 169 + }, + { + "epoch": 0.021625747360386718, + "ewc_loss": 1.2386590242385864e-07, + "grad_norm": 3.16137957572937, + "learning_rate": 7.164052564646036e-08, + "loss": 0.6949, + "mean_token_accuracy": 0.7989679574966431, + "num_tokens": 6519697.0, + "step": 170 + }, + { + "epoch": 0.02175295763897723, + "ewc_loss": 1.2479722499847412e-07, + "grad_norm": 3.530195474624634, + "learning_rate": 7.206443408223823e-08, + "loss": 0.7547, + "mean_token_accuracy": 0.7789942026138306, + "num_tokens": 6554687.0, + "step": 171 + }, + { + "epoch": 0.02188016791756774, + "ewc_loss": 1.257285475730896e-07, + "grad_norm": 3.4489738941192627, + "learning_rate": 7.24883425180161e-08, + "loss": 0.7172, + "mean_token_accuracy": 0.7835267186164856, + "num_tokens": 6593066.0, + "step": 172 + }, + { + "epoch": 0.02200737819615825, + "ewc_loss": 1.2665987014770508e-07, + "grad_norm": 3.0697262287139893, + "learning_rate": 7.291225095379398e-08, + "loss": 0.676, + "mean_token_accuracy": 0.8036282658576965, + "num_tokens": 6635484.0, + "step": 173 + }, + { + "epoch": 0.02213458847474876, + "ewc_loss": 1.2665987014770508e-07, + "grad_norm": 3.032956838607788, + "learning_rate": 7.333615938957185e-08, + "loss": 0.7679, + "mean_token_accuracy": 0.7768048048019409, + "num_tokens": 6677090.0, + "step": 174 + }, + { + "epoch": 0.02226179875333927, + "ewc_loss": 1.2945383787155151e-07, + "grad_norm": 3.470280170440674, + "learning_rate": 7.376006782534971e-08, + "loss": 0.6829, + "mean_token_accuracy": 0.7974414229393005, + "num_tokens": 6710250.0, + "step": 175 + }, + { + "epoch": 0.02238900903192978, + "ewc_loss": 1.3317912817001343e-07, + "grad_norm": 2.9493422508239746, + "learning_rate": 7.418397626112758e-08, + "loss": 0.7094, + "mean_token_accuracy": 0.7892557382583618, + "num_tokens": 6752998.0, + "step": 176 + }, + { + "epoch": 0.02251621931052029, + "ewc_loss": 1.601874828338623e-07, + "grad_norm": 16.829071044921875, + "learning_rate": 7.460788469690545e-08, + "loss": 0.7103, + "mean_token_accuracy": 0.7951128482818604, + "num_tokens": 6789568.0, + "step": 177 + }, + { + "epoch": 0.0226434295891108, + "ewc_loss": 1.3783574104309082e-07, + "grad_norm": 4.2148051261901855, + "learning_rate": 7.503179313268333e-08, + "loss": 0.7506, + "mean_token_accuracy": 0.7789310216903687, + "num_tokens": 6822810.0, + "step": 178 + }, + { + "epoch": 0.02277063986770131, + "ewc_loss": 1.3969838619232178e-07, + "grad_norm": 3.1437838077545166, + "learning_rate": 7.54557015684612e-08, + "loss": 0.6896, + "mean_token_accuracy": 0.7948847413063049, + "num_tokens": 6861598.0, + "step": 179 + }, + { + "epoch": 0.02289785014629182, + "ewc_loss": 1.4156103134155273e-07, + "grad_norm": 3.1236371994018555, + "learning_rate": 7.587961000423907e-08, + "loss": 0.7114, + "mean_token_accuracy": 0.7918777465820312, + "num_tokens": 6900220.0, + "step": 180 + }, + { + "epoch": 0.02302506042488233, + "ewc_loss": 1.4528632164001465e-07, + "grad_norm": 3.5033745765686035, + "learning_rate": 7.630351844001694e-08, + "loss": 0.82, + "mean_token_accuracy": 0.7607794404029846, + "num_tokens": 6946983.0, + "step": 181 + }, + { + "epoch": 0.02315227070347284, + "ewc_loss": 1.471489667892456e-07, + "grad_norm": 3.1337568759918213, + "learning_rate": 7.672742687579482e-08, + "loss": 0.7239, + "mean_token_accuracy": 0.7893110513687134, + "num_tokens": 6986019.0, + "step": 182 + }, + { + "epoch": 0.02327948098206335, + "ewc_loss": 1.4901161193847656e-07, + "grad_norm": 3.2271106243133545, + "learning_rate": 7.715133531157269e-08, + "loss": 0.6973, + "mean_token_accuracy": 0.7902875542640686, + "num_tokens": 7020722.0, + "step": 183 + }, + { + "epoch": 0.02340669126065386, + "ewc_loss": 1.5273690223693848e-07, + "grad_norm": 3.6480703353881836, + "learning_rate": 7.757524374735056e-08, + "loss": 0.7685, + "mean_token_accuracy": 0.7758758068084717, + "num_tokens": 7062691.0, + "step": 184 + }, + { + "epoch": 0.02353390153924437, + "ewc_loss": 1.5459954738616943e-07, + "grad_norm": 3.37423038482666, + "learning_rate": 7.799915218312844e-08, + "loss": 0.716, + "mean_token_accuracy": 0.7909955382347107, + "num_tokens": 7101090.0, + "step": 185 + }, + { + "epoch": 0.02366111181783488, + "ewc_loss": 1.5739351511001587e-07, + "grad_norm": 3.032268524169922, + "learning_rate": 7.842306061890631e-08, + "loss": 0.689, + "mean_token_accuracy": 0.7925138473510742, + "num_tokens": 7139891.0, + "step": 186 + }, + { + "epoch": 0.023788322096425393, + "ewc_loss": 1.6205012798309326e-07, + "grad_norm": 3.280310869216919, + "learning_rate": 7.88469690546842e-08, + "loss": 0.7364, + "mean_token_accuracy": 0.7826349139213562, + "num_tokens": 7179501.0, + "step": 187 + }, + { + "epoch": 0.0239155323750159, + "ewc_loss": 1.6391277313232422e-07, + "grad_norm": 3.5257949829101562, + "learning_rate": 7.927087749046207e-08, + "loss": 0.7057, + "mean_token_accuracy": 0.7899432182312012, + "num_tokens": 7213809.0, + "step": 188 + }, + { + "epoch": 0.02404274265360641, + "ewc_loss": 1.6577541828155518e-07, + "grad_norm": 3.576976776123047, + "learning_rate": 7.969478592623994e-08, + "loss": 0.7797, + "mean_token_accuracy": 0.7680351138114929, + "num_tokens": 7254493.0, + "step": 189 + }, + { + "epoch": 0.024169952932196922, + "ewc_loss": 1.6670674085617065e-07, + "grad_norm": 5.030829429626465, + "learning_rate": 8.011869436201781e-08, + "loss": 0.6379, + "mean_token_accuracy": 0.8107033967971802, + "num_tokens": 7292643.0, + "step": 190 + }, + { + "epoch": 0.024297163210787433, + "ewc_loss": 1.685693860054016e-07, + "grad_norm": 3.605072021484375, + "learning_rate": 8.054260279779568e-08, + "loss": 0.7596, + "mean_token_accuracy": 0.7787445783615112, + "num_tokens": 7329810.0, + "step": 191 + }, + { + "epoch": 0.02442437348937794, + "ewc_loss": 1.7229467630386353e-07, + "grad_norm": 3.2246899604797363, + "learning_rate": 8.096651123357356e-08, + "loss": 0.745, + "mean_token_accuracy": 0.7802241444587708, + "num_tokens": 7367630.0, + "step": 192 + }, + { + "epoch": 0.02455158376796845, + "ewc_loss": 1.73225998878479e-07, + "grad_norm": 3.2347819805145264, + "learning_rate": 8.139041966935143e-08, + "loss": 0.7275, + "mean_token_accuracy": 0.7849327325820923, + "num_tokens": 7411580.0, + "step": 193 + }, + { + "epoch": 0.024678794046558962, + "ewc_loss": 1.7695128917694092e-07, + "grad_norm": 3.1981875896453857, + "learning_rate": 8.181432810512929e-08, + "loss": 0.6807, + "mean_token_accuracy": 0.7979450225830078, + "num_tokens": 7451631.0, + "step": 194 + }, + { + "epoch": 0.024806004325149473, + "ewc_loss": 1.7881393432617188e-07, + "grad_norm": 3.953582525253296, + "learning_rate": 8.223823654090716e-08, + "loss": 0.6898, + "mean_token_accuracy": 0.7944424152374268, + "num_tokens": 7493645.0, + "step": 195 + }, + { + "epoch": 0.024933214603739984, + "ewc_loss": 2.086162567138672e-07, + "grad_norm": 16.82448959350586, + "learning_rate": 8.266214497668503e-08, + "loss": 0.6495, + "mean_token_accuracy": 0.8086801767349243, + "num_tokens": 7538042.0, + "step": 196 + }, + { + "epoch": 0.02506042488233049, + "ewc_loss": 1.8533319234848022e-07, + "grad_norm": 4.16462516784668, + "learning_rate": 8.30860534124629e-08, + "loss": 0.7122, + "mean_token_accuracy": 0.7886447906494141, + "num_tokens": 7575374.0, + "step": 197 + }, + { + "epoch": 0.025187635160921002, + "ewc_loss": 1.9185245037078857e-07, + "grad_norm": 4.188424587249756, + "learning_rate": 8.350996184824078e-08, + "loss": 0.7037, + "mean_token_accuracy": 0.7901791334152222, + "num_tokens": 7606880.0, + "step": 198 + }, + { + "epoch": 0.025314845439511513, + "ewc_loss": 1.955777406692505e-07, + "grad_norm": 3.4054439067840576, + "learning_rate": 8.393387028401865e-08, + "loss": 0.6822, + "mean_token_accuracy": 0.7994483113288879, + "num_tokens": 7644840.0, + "step": 199 + }, + { + "epoch": 0.025442055718102024, + "ewc_loss": 2.0023435354232788e-07, + "grad_norm": 3.1248292922973633, + "learning_rate": 8.435777871979652e-08, + "loss": 0.7231, + "mean_token_accuracy": 0.7847858667373657, + "num_tokens": 7683856.0, + "step": 200 + }, + { + "epoch": 0.02556926599669253, + "ewc_loss": 2.0209699869155884e-07, + "grad_norm": 4.607271194458008, + "learning_rate": 8.47816871555744e-08, + "loss": 0.6799, + "mean_token_accuracy": 0.798133373260498, + "num_tokens": 7715306.0, + "step": 201 + }, + { + "epoch": 0.025696476275283042, + "ewc_loss": 2.0302832126617432e-07, + "grad_norm": 5.00862455368042, + "learning_rate": 8.520559559135227e-08, + "loss": 0.6412, + "mean_token_accuracy": 0.8080987930297852, + "num_tokens": 7752442.0, + "step": 202 + }, + { + "epoch": 0.025823686553873553, + "ewc_loss": 2.0675361156463623e-07, + "grad_norm": 4.26257848739624, + "learning_rate": 8.562950402713014e-08, + "loss": 0.7262, + "mean_token_accuracy": 0.7874436378479004, + "num_tokens": 7799891.0, + "step": 203 + }, + { + "epoch": 0.025950896832464064, + "ewc_loss": 2.0954757928848267e-07, + "grad_norm": 3.436009407043457, + "learning_rate": 8.605341246290801e-08, + "loss": 0.7131, + "mean_token_accuracy": 0.7856573462486267, + "num_tokens": 7830696.0, + "step": 204 + }, + { + "epoch": 0.026078107111054575, + "ewc_loss": 2.1141022443771362e-07, + "grad_norm": 3.315582036972046, + "learning_rate": 8.647732089868589e-08, + "loss": 0.6848, + "mean_token_accuracy": 0.7974880933761597, + "num_tokens": 7870968.0, + "step": 205 + }, + { + "epoch": 0.026205317389645082, + "ewc_loss": 2.1420419216156006e-07, + "grad_norm": 3.3043758869171143, + "learning_rate": 8.690122933446376e-08, + "loss": 0.7095, + "mean_token_accuracy": 0.790366530418396, + "num_tokens": 7914418.0, + "step": 206 + }, + { + "epoch": 0.026332527668235593, + "ewc_loss": 2.1886080503463745e-07, + "grad_norm": 3.3202664852142334, + "learning_rate": 8.732513777024163e-08, + "loss": 0.6468, + "mean_token_accuracy": 0.8117319345474243, + "num_tokens": 7954544.0, + "step": 207 + }, + { + "epoch": 0.026459737946826104, + "ewc_loss": 2.1886080503463745e-07, + "grad_norm": 3.2993781566619873, + "learning_rate": 8.77490462060195e-08, + "loss": 0.6956, + "mean_token_accuracy": 0.7962020635604858, + "num_tokens": 7997373.0, + "step": 208 + }, + { + "epoch": 0.026586948225416615, + "ewc_loss": 2.1886080503463745e-07, + "grad_norm": 5.183437347412109, + "learning_rate": 8.817295464179738e-08, + "loss": 0.6582, + "mean_token_accuracy": 0.7996900677680969, + "num_tokens": 8033314.0, + "step": 209 + }, + { + "epoch": 0.026714158504007122, + "ewc_loss": 2.2258609533309937e-07, + "grad_norm": 3.7334887981414795, + "learning_rate": 8.859686307757525e-08, + "loss": 0.6783, + "mean_token_accuracy": 0.79868084192276, + "num_tokens": 8069362.0, + "step": 210 + }, + { + "epoch": 0.026841368782597633, + "ewc_loss": 2.2351741790771484e-07, + "grad_norm": 3.639054775238037, + "learning_rate": 8.902077151335312e-08, + "loss": 0.7106, + "mean_token_accuracy": 0.7868783473968506, + "num_tokens": 8109126.0, + "step": 211 + }, + { + "epoch": 0.026968579061188144, + "ewc_loss": 2.2724270820617676e-07, + "grad_norm": 4.553625583648682, + "learning_rate": 8.944467994913098e-08, + "loss": 0.7639, + "mean_token_accuracy": 0.778261661529541, + "num_tokens": 8143083.0, + "step": 212 + }, + { + "epoch": 0.027095789339778655, + "ewc_loss": 2.3189932107925415e-07, + "grad_norm": 3.904433488845825, + "learning_rate": 8.986858838490885e-08, + "loss": 0.6142, + "mean_token_accuracy": 0.8106955289840698, + "num_tokens": 8176636.0, + "step": 213 + }, + { + "epoch": 0.027222999618369163, + "ewc_loss": 2.3469328880310059e-07, + "grad_norm": 3.2559709548950195, + "learning_rate": 9.029249682068673e-08, + "loss": 0.6747, + "mean_token_accuracy": 0.7980435490608215, + "num_tokens": 8220262.0, + "step": 214 + }, + { + "epoch": 0.027350209896959674, + "ewc_loss": 2.3562461137771606e-07, + "grad_norm": 5.084784030914307, + "learning_rate": 9.07164052564646e-08, + "loss": 0.6918, + "mean_token_accuracy": 0.7943786978721619, + "num_tokens": 8256303.0, + "step": 215 + }, + { + "epoch": 0.027477420175550184, + "ewc_loss": 2.3562461137771606e-07, + "grad_norm": 4.790286540985107, + "learning_rate": 9.114031369224247e-08, + "loss": 0.6587, + "mean_token_accuracy": 0.8061647415161133, + "num_tokens": 8293888.0, + "step": 216 + }, + { + "epoch": 0.027604630454140695, + "ewc_loss": 2.3655593395233154e-07, + "grad_norm": 3.477004051208496, + "learning_rate": 9.156422212802034e-08, + "loss": 0.6202, + "mean_token_accuracy": 0.8133252859115601, + "num_tokens": 8335368.0, + "step": 217 + }, + { + "epoch": 0.027731840732731206, + "ewc_loss": 2.4028122425079346e-07, + "grad_norm": 6.033039093017578, + "learning_rate": 9.198813056379822e-08, + "loss": 0.674, + "mean_token_accuracy": 0.800794243812561, + "num_tokens": 8370338.0, + "step": 218 + }, + { + "epoch": 0.027859051011321714, + "ewc_loss": 2.4400651454925537e-07, + "grad_norm": 8.453643798828125, + "learning_rate": 9.241203899957609e-08, + "loss": 0.6601, + "mean_token_accuracy": 0.8023126721382141, + "num_tokens": 8409904.0, + "step": 219 + }, + { + "epoch": 0.027986261289912225, + "ewc_loss": 2.4400651454925537e-07, + "grad_norm": 3.890375852584839, + "learning_rate": 9.283594743535396e-08, + "loss": 0.7869, + "mean_token_accuracy": 0.7684342861175537, + "num_tokens": 8446579.0, + "step": 220 + }, + { + "epoch": 0.028113471568502735, + "ewc_loss": 2.4400651454925537e-07, + "grad_norm": 3.2349612712860107, + "learning_rate": 9.325985587113183e-08, + "loss": 0.6355, + "mean_token_accuracy": 0.810756266117096, + "num_tokens": 8483663.0, + "step": 221 + }, + { + "epoch": 0.028240681847093246, + "ewc_loss": 2.4400651454925537e-07, + "grad_norm": 4.1327691078186035, + "learning_rate": 9.368376430690971e-08, + "loss": 0.664, + "mean_token_accuracy": 0.8011646270751953, + "num_tokens": 8516346.0, + "step": 222 + }, + { + "epoch": 0.028367892125683754, + "ewc_loss": 2.4400651454925537e-07, + "grad_norm": 3.254533290863037, + "learning_rate": 9.410767274268758e-08, + "loss": 0.661, + "mean_token_accuracy": 0.8029942512512207, + "num_tokens": 8556914.0, + "step": 223 + }, + { + "epoch": 0.028495102404274265, + "ewc_loss": 2.4400651454925537e-07, + "grad_norm": 4.464967250823975, + "learning_rate": 9.453158117846545e-08, + "loss": 0.6506, + "mean_token_accuracy": 0.8029242753982544, + "num_tokens": 8596548.0, + "step": 224 + }, + { + "epoch": 0.028622312682864776, + "ewc_loss": 2.4400651454925537e-07, + "grad_norm": 3.8810577392578125, + "learning_rate": 9.495548961424333e-08, + "loss": 0.6806, + "mean_token_accuracy": 0.8005980253219604, + "num_tokens": 8634366.0, + "step": 225 + }, + { + "epoch": 0.028749522961455286, + "ewc_loss": 2.4400651454925537e-07, + "grad_norm": 6.0144147872924805, + "learning_rate": 9.53793980500212e-08, + "loss": 0.6204, + "mean_token_accuracy": 0.810851514339447, + "num_tokens": 8666480.0, + "step": 226 + }, + { + "epoch": 0.028876733240045797, + "ewc_loss": 2.4400651454925537e-07, + "grad_norm": 4.560459613800049, + "learning_rate": 9.580330648579907e-08, + "loss": 0.7702, + "mean_token_accuracy": 0.7765587568283081, + "num_tokens": 8705880.0, + "step": 227 + }, + { + "epoch": 0.029003943518636305, + "ewc_loss": 2.4400651454925537e-07, + "grad_norm": 4.229746341705322, + "learning_rate": 9.622721492157694e-08, + "loss": 0.6318, + "mean_token_accuracy": 0.808853805065155, + "num_tokens": 8743772.0, + "step": 228 + }, + { + "epoch": 0.029131153797226816, + "ewc_loss": 2.4586915969848633e-07, + "grad_norm": 4.474792003631592, + "learning_rate": 9.665112335735482e-08, + "loss": 0.6565, + "mean_token_accuracy": 0.7997676134109497, + "num_tokens": 8775919.0, + "step": 229 + }, + { + "epoch": 0.029258364075817327, + "ewc_loss": 2.477318048477173e-07, + "grad_norm": 5.981403350830078, + "learning_rate": 9.707503179313267e-08, + "loss": 0.6051, + "mean_token_accuracy": 0.8145734071731567, + "num_tokens": 8809336.0, + "step": 230 + }, + { + "epoch": 0.029385574354407838, + "ewc_loss": 2.4586915969848633e-07, + "grad_norm": 4.134286880493164, + "learning_rate": 9.749894022891055e-08, + "loss": 0.659, + "mean_token_accuracy": 0.8018964529037476, + "num_tokens": 8845952.0, + "step": 231 + }, + { + "epoch": 0.029512784632998345, + "ewc_loss": 2.477318048477173e-07, + "grad_norm": 3.535968065261841, + "learning_rate": 9.792284866468842e-08, + "loss": 0.679, + "mean_token_accuracy": 0.7934824228286743, + "num_tokens": 8889801.0, + "step": 232 + }, + { + "epoch": 0.029639994911588856, + "ewc_loss": 2.477318048477173e-07, + "grad_norm": 4.448696136474609, + "learning_rate": 9.834675710046629e-08, + "loss": 0.6378, + "mean_token_accuracy": 0.8050020933151245, + "num_tokens": 8925429.0, + "step": 233 + }, + { + "epoch": 0.029767205190179367, + "ewc_loss": 2.4959444999694824e-07, + "grad_norm": 4.512937545776367, + "learning_rate": 9.877066553624416e-08, + "loss": 0.6713, + "mean_token_accuracy": 0.7973493933677673, + "num_tokens": 8963360.0, + "step": 234 + }, + { + "epoch": 0.029894415468769878, + "ewc_loss": 2.4959444999694824e-07, + "grad_norm": 5.85660982131958, + "learning_rate": 9.919457397202204e-08, + "loss": 0.6818, + "mean_token_accuracy": 0.7974012494087219, + "num_tokens": 8998314.0, + "step": 235 + }, + { + "epoch": 0.030021625747360385, + "ewc_loss": 2.4959444999694824e-07, + "grad_norm": 5.015904903411865, + "learning_rate": 9.961848240779991e-08, + "loss": 0.7516, + "mean_token_accuracy": 0.778224766254425, + "num_tokens": 9034804.0, + "step": 236 + }, + { + "epoch": 0.030148836025950896, + "ewc_loss": 2.5331974029541016e-07, + "grad_norm": 5.1188201904296875, + "learning_rate": 1.0004239084357778e-07, + "loss": 0.6743, + "mean_token_accuracy": 0.7993641495704651, + "num_tokens": 9070840.0, + "step": 237 + }, + { + "epoch": 0.030276046304541407, + "ewc_loss": 2.551823854446411e-07, + "grad_norm": 3.920128107070923, + "learning_rate": 1.0046629927935566e-07, + "loss": 0.6563, + "mean_token_accuracy": 0.8040991425514221, + "num_tokens": 9109370.0, + "step": 238 + }, + { + "epoch": 0.030403256583131918, + "ewc_loss": 2.5704503059387207e-07, + "grad_norm": 5.211834907531738, + "learning_rate": 1.0089020771513353e-07, + "loss": 0.6005, + "mean_token_accuracy": 0.8201383352279663, + "num_tokens": 9148191.0, + "step": 239 + }, + { + "epoch": 0.03053046686172243, + "ewc_loss": 2.5704503059387207e-07, + "grad_norm": 5.110783576965332, + "learning_rate": 1.013141161509114e-07, + "loss": 0.6669, + "mean_token_accuracy": 0.8003915548324585, + "num_tokens": 9182923.0, + "step": 240 + }, + { + "epoch": 0.030657677140312936, + "ewc_loss": 2.5890767574310303e-07, + "grad_norm": 4.949234485626221, + "learning_rate": 1.0173802458668927e-07, + "loss": 0.6663, + "mean_token_accuracy": 0.7971608638763428, + "num_tokens": 9215882.0, + "step": 241 + }, + { + "epoch": 0.030784887418903447, + "ewc_loss": 2.60770320892334e-07, + "grad_norm": 4.06166410446167, + "learning_rate": 1.0216193302246715e-07, + "loss": 0.698, + "mean_token_accuracy": 0.7864605188369751, + "num_tokens": 9249718.0, + "step": 242 + }, + { + "epoch": 0.030912097697493958, + "ewc_loss": 2.644956111907959e-07, + "grad_norm": 3.9779410362243652, + "learning_rate": 1.0258584145824502e-07, + "loss": 0.621, + "mean_token_accuracy": 0.8125342726707458, + "num_tokens": 9291146.0, + "step": 243 + }, + { + "epoch": 0.03103930797608447, + "ewc_loss": 2.60770320892334e-07, + "grad_norm": 5.464099407196045, + "learning_rate": 1.0300974989402289e-07, + "loss": 0.6124, + "mean_token_accuracy": 0.8128499984741211, + "num_tokens": 9326403.0, + "step": 244 + }, + { + "epoch": 0.031166518254674976, + "ewc_loss": 2.6263296604156494e-07, + "grad_norm": 3.6519393920898438, + "learning_rate": 1.0343365832980076e-07, + "loss": 0.6657, + "mean_token_accuracy": 0.800825297832489, + "num_tokens": 9368491.0, + "step": 245 + }, + { + "epoch": 0.03129372853326549, + "ewc_loss": 2.644956111907959e-07, + "grad_norm": 3.391523838043213, + "learning_rate": 1.0385756676557864e-07, + "loss": 0.661, + "mean_token_accuracy": 0.8024548292160034, + "num_tokens": 9409041.0, + "step": 246 + }, + { + "epoch": 0.031420938811855995, + "ewc_loss": 2.905726432800293e-07, + "grad_norm": 16.835796356201172, + "learning_rate": 1.0428147520135651e-07, + "loss": 0.6342, + "mean_token_accuracy": 0.8092589378356934, + "num_tokens": 9448591.0, + "step": 247 + }, + { + "epoch": 0.03154814909044651, + "ewc_loss": 2.7194619178771973e-07, + "grad_norm": 6.009798526763916, + "learning_rate": 1.0470538363713437e-07, + "loss": 0.6648, + "mean_token_accuracy": 0.7987100481987, + "num_tokens": 9486604.0, + "step": 248 + }, + { + "epoch": 0.031675359369037016, + "ewc_loss": 2.7567148208618164e-07, + "grad_norm": 6.332827091217041, + "learning_rate": 1.0512929207291224e-07, + "loss": 0.6705, + "mean_token_accuracy": 0.7942172288894653, + "num_tokens": 9525285.0, + "step": 249 + }, + { + "epoch": 0.03180256964762753, + "ewc_loss": 2.7194619178771973e-07, + "grad_norm": 7.969474792480469, + "learning_rate": 1.0555320050869011e-07, + "loss": 0.6512, + "mean_token_accuracy": 0.7993592023849487, + "num_tokens": 9557898.0, + "step": 250 + }, + { + "epoch": 0.03192977992621804, + "ewc_loss": 2.738088369369507e-07, + "grad_norm": 5.321089267730713, + "learning_rate": 1.0597710894446799e-07, + "loss": 0.7102, + "mean_token_accuracy": 0.7824714183807373, + "num_tokens": 9590438.0, + "step": 251 + }, + { + "epoch": 0.032056990204808546, + "ewc_loss": 2.7194619178771973e-07, + "grad_norm": 4.097842216491699, + "learning_rate": 1.0640101738024586e-07, + "loss": 0.6216, + "mean_token_accuracy": 0.8118769526481628, + "num_tokens": 9631402.0, + "step": 252 + }, + { + "epoch": 0.03218420048339906, + "ewc_loss": 2.7194619178771973e-07, + "grad_norm": 3.8638081550598145, + "learning_rate": 1.0682492581602373e-07, + "loss": 0.6017, + "mean_token_accuracy": 0.8181278705596924, + "num_tokens": 9663702.0, + "step": 253 + }, + { + "epoch": 0.03231141076198957, + "ewc_loss": 2.7008354663848877e-07, + "grad_norm": 5.312977313995361, + "learning_rate": 1.072488342518016e-07, + "loss": 0.5905, + "mean_token_accuracy": 0.8177251815795898, + "num_tokens": 9698969.0, + "step": 254 + }, + { + "epoch": 0.03243862104058008, + "ewc_loss": 2.738088369369507e-07, + "grad_norm": 4.569464206695557, + "learning_rate": 1.0767274268757948e-07, + "loss": 0.6604, + "mean_token_accuracy": 0.7989878058433533, + "num_tokens": 9738075.0, + "step": 255 + }, + { + "epoch": 0.03256583131917059, + "ewc_loss": 2.7567148208618164e-07, + "grad_norm": 4.576035499572754, + "learning_rate": 1.0809665112335735e-07, + "loss": 0.6652, + "mean_token_accuracy": 0.8013838529586792, + "num_tokens": 9782675.0, + "step": 256 + }, + { + "epoch": 0.0326930415977611, + "ewc_loss": 2.7567148208618164e-07, + "grad_norm": 4.454951763153076, + "learning_rate": 1.0852055955913522e-07, + "loss": 0.5652, + "mean_token_accuracy": 0.8266099095344543, + "num_tokens": 9820585.0, + "step": 257 + }, + { + "epoch": 0.03282025187635161, + "ewc_loss": 2.7567148208618164e-07, + "grad_norm": 4.177799701690674, + "learning_rate": 1.089444679949131e-07, + "loss": 0.6898, + "mean_token_accuracy": 0.7921759486198425, + "num_tokens": 9860693.0, + "step": 258 + }, + { + "epoch": 0.03294746215494212, + "ewc_loss": 2.738088369369507e-07, + "grad_norm": 5.216511249542236, + "learning_rate": 1.0936837643069097e-07, + "loss": 0.6234, + "mean_token_accuracy": 0.8114327192306519, + "num_tokens": 9902629.0, + "step": 259 + }, + { + "epoch": 0.03307467243353263, + "ewc_loss": 2.7939677238464355e-07, + "grad_norm": 3.747825860977173, + "learning_rate": 1.0979228486646884e-07, + "loss": 0.5949, + "mean_token_accuracy": 0.8157461285591125, + "num_tokens": 9937304.0, + "step": 260 + }, + { + "epoch": 0.03320188271212314, + "ewc_loss": 2.812594175338745e-07, + "grad_norm": 5.638525009155273, + "learning_rate": 1.1021619330224671e-07, + "loss": 0.7282, + "mean_token_accuracy": 0.7817624807357788, + "num_tokens": 9976911.0, + "step": 261 + }, + { + "epoch": 0.03332909299071365, + "ewc_loss": 2.849847078323364e-07, + "grad_norm": 4.098443508148193, + "learning_rate": 1.1064010173802458e-07, + "loss": 0.6628, + "mean_token_accuracy": 0.8007690906524658, + "num_tokens": 10015740.0, + "step": 262 + }, + { + "epoch": 0.03345630326930416, + "ewc_loss": 2.8312206268310547e-07, + "grad_norm": 3.677586078643799, + "learning_rate": 1.1106401017380246e-07, + "loss": 0.5997, + "mean_token_accuracy": 0.818276047706604, + "num_tokens": 10053054.0, + "step": 263 + }, + { + "epoch": 0.03358351354789467, + "ewc_loss": 2.8312206268310547e-07, + "grad_norm": 7.791445255279541, + "learning_rate": 1.1148791860958033e-07, + "loss": 0.6952, + "mean_token_accuracy": 0.7861472368240356, + "num_tokens": 10082088.0, + "step": 264 + }, + { + "epoch": 0.03371072382648518, + "ewc_loss": 2.868473529815674e-07, + "grad_norm": 4.413455486297607, + "learning_rate": 1.119118270453582e-07, + "loss": 0.6693, + "mean_token_accuracy": 0.797116756439209, + "num_tokens": 10121122.0, + "step": 265 + }, + { + "epoch": 0.03383793410507569, + "ewc_loss": 2.868473529815674e-07, + "grad_norm": 4.378622531890869, + "learning_rate": 1.1233573548113607e-07, + "loss": 0.6632, + "mean_token_accuracy": 0.8011453747749329, + "num_tokens": 10161017.0, + "step": 266 + }, + { + "epoch": 0.0339651443836662, + "ewc_loss": 2.868473529815674e-07, + "grad_norm": 3.7914388179779053, + "learning_rate": 1.1275964391691393e-07, + "loss": 0.6303, + "mean_token_accuracy": 0.8103694915771484, + "num_tokens": 10204913.0, + "step": 267 + }, + { + "epoch": 0.03409235466225671, + "ewc_loss": 2.8870999813079834e-07, + "grad_norm": 4.173420429229736, + "learning_rate": 1.131835523526918e-07, + "loss": 0.6781, + "mean_token_accuracy": 0.7924284338951111, + "num_tokens": 10245154.0, + "step": 268 + }, + { + "epoch": 0.03421956494084722, + "ewc_loss": 2.905726432800293e-07, + "grad_norm": 3.691335916519165, + "learning_rate": 1.1360746078846968e-07, + "loss": 0.685, + "mean_token_accuracy": 0.7928832769393921, + "num_tokens": 10283006.0, + "step": 269 + }, + { + "epoch": 0.03434677521943773, + "ewc_loss": 2.905726432800293e-07, + "grad_norm": 5.489937782287598, + "learning_rate": 1.1403136922424755e-07, + "loss": 0.644, + "mean_token_accuracy": 0.8039697408676147, + "num_tokens": 10316689.0, + "step": 270 + }, + { + "epoch": 0.03447398549802824, + "ewc_loss": 2.905726432800293e-07, + "grad_norm": 4.997364521026611, + "learning_rate": 1.1445527766002542e-07, + "loss": 0.6391, + "mean_token_accuracy": 0.8035380840301514, + "num_tokens": 10358145.0, + "step": 271 + }, + { + "epoch": 0.03460119577661875, + "ewc_loss": 2.942979335784912e-07, + "grad_norm": 5.809206008911133, + "learning_rate": 1.148791860958033e-07, + "loss": 0.7394, + "mean_token_accuracy": 0.777178168296814, + "num_tokens": 10393424.0, + "step": 272 + }, + { + "epoch": 0.034728406055209264, + "ewc_loss": 2.998858690261841e-07, + "grad_norm": 3.7600691318511963, + "learning_rate": 1.1530309453158117e-07, + "loss": 0.6543, + "mean_token_accuracy": 0.8012163639068604, + "num_tokens": 10433560.0, + "step": 273 + }, + { + "epoch": 0.03485561633379977, + "ewc_loss": 2.980232238769531e-07, + "grad_norm": 4.350419044494629, + "learning_rate": 1.1572700296735904e-07, + "loss": 0.6501, + "mean_token_accuracy": 0.8010105490684509, + "num_tokens": 10472024.0, + "step": 274 + }, + { + "epoch": 0.03498282661239028, + "ewc_loss": 2.998858690261841e-07, + "grad_norm": 4.253203392028809, + "learning_rate": 1.1615091140313691e-07, + "loss": 0.6886, + "mean_token_accuracy": 0.7914829254150391, + "num_tokens": 10510874.0, + "step": 275 + }, + { + "epoch": 0.03511003689098079, + "ewc_loss": 2.998858690261841e-07, + "grad_norm": 4.788109302520752, + "learning_rate": 1.1657481983891479e-07, + "loss": 0.6267, + "mean_token_accuracy": 0.8096905946731567, + "num_tokens": 10552411.0, + "step": 276 + }, + { + "epoch": 0.0352372471695713, + "ewc_loss": 2.998858690261841e-07, + "grad_norm": 4.935317516326904, + "learning_rate": 1.1699872827469266e-07, + "loss": 0.6432, + "mean_token_accuracy": 0.80622398853302, + "num_tokens": 10591537.0, + "step": 277 + }, + { + "epoch": 0.03536445744816181, + "ewc_loss": 3.03611159324646e-07, + "grad_norm": 4.584756374359131, + "learning_rate": 1.1742263671047053e-07, + "loss": 0.651, + "mean_token_accuracy": 0.8034168481826782, + "num_tokens": 10632228.0, + "step": 278 + }, + { + "epoch": 0.03549166772675232, + "ewc_loss": 3.073364496231079e-07, + "grad_norm": 7.101899147033691, + "learning_rate": 1.178465451462484e-07, + "loss": 0.7071, + "mean_token_accuracy": 0.7844976782798767, + "num_tokens": 10672829.0, + "step": 279 + }, + { + "epoch": 0.03561887800534283, + "ewc_loss": 3.129243850708008e-07, + "grad_norm": 6.18212890625, + "learning_rate": 1.1827045358202628e-07, + "loss": 0.6805, + "mean_token_accuracy": 0.7998534440994263, + "num_tokens": 10706146.0, + "step": 280 + }, + { + "epoch": 0.035746088283933344, + "ewc_loss": 3.1478703022003174e-07, + "grad_norm": 5.43107795715332, + "learning_rate": 1.1869436201780415e-07, + "loss": 0.5977, + "mean_token_accuracy": 0.8192282915115356, + "num_tokens": 10744112.0, + "step": 281 + }, + { + "epoch": 0.03587329856252385, + "ewc_loss": 3.1478703022003174e-07, + "grad_norm": 5.961733818054199, + "learning_rate": 1.1911827045358202e-07, + "loss": 0.628, + "mean_token_accuracy": 0.8132067322731018, + "num_tokens": 10783726.0, + "step": 282 + }, + { + "epoch": 0.03600050884111436, + "ewc_loss": 3.2223761081695557e-07, + "grad_norm": 4.970330715179443, + "learning_rate": 1.195421788893599e-07, + "loss": 0.6924, + "mean_token_accuracy": 0.7936382293701172, + "num_tokens": 10824476.0, + "step": 283 + }, + { + "epoch": 0.036127719119704874, + "ewc_loss": 3.2223761081695557e-07, + "grad_norm": 5.844291687011719, + "learning_rate": 1.1996608732513778e-07, + "loss": 0.6517, + "mean_token_accuracy": 0.8025597333908081, + "num_tokens": 10859997.0, + "step": 284 + }, + { + "epoch": 0.03625492939829538, + "ewc_loss": 3.2223761081695557e-07, + "grad_norm": 6.106836318969727, + "learning_rate": 1.2038999576091563e-07, + "loss": 0.6228, + "mean_token_accuracy": 0.8119120597839355, + "num_tokens": 10898800.0, + "step": 285 + }, + { + "epoch": 0.036382139676885895, + "ewc_loss": 3.241002559661865e-07, + "grad_norm": 5.3491435050964355, + "learning_rate": 1.208139041966935e-07, + "loss": 0.5887, + "mean_token_accuracy": 0.8220880031585693, + "num_tokens": 10937462.0, + "step": 286 + }, + { + "epoch": 0.0365093499554764, + "ewc_loss": 3.296881914138794e-07, + "grad_norm": 4.975236892700195, + "learning_rate": 1.2123781263247137e-07, + "loss": 0.649, + "mean_token_accuracy": 0.7995718717575073, + "num_tokens": 10970739.0, + "step": 287 + }, + { + "epoch": 0.03663656023406691, + "ewc_loss": 3.3155083656311035e-07, + "grad_norm": 5.345465183258057, + "learning_rate": 1.2166172106824924e-07, + "loss": 0.702, + "mean_token_accuracy": 0.7948175668716431, + "num_tokens": 11003000.0, + "step": 288 + }, + { + "epoch": 0.036763770512657425, + "ewc_loss": 3.296881914138794e-07, + "grad_norm": 5.08404016494751, + "learning_rate": 1.2208562950402712e-07, + "loss": 0.607, + "mean_token_accuracy": 0.8124017119407654, + "num_tokens": 11039665.0, + "step": 289 + }, + { + "epoch": 0.03689098079124793, + "ewc_loss": 3.3527612686157227e-07, + "grad_norm": 4.262539386749268, + "learning_rate": 1.22509537939805e-07, + "loss": 0.6341, + "mean_token_accuracy": 0.8023197650909424, + "num_tokens": 11078368.0, + "step": 290 + }, + { + "epoch": 0.03701819106983844, + "ewc_loss": 3.371387720108032e-07, + "grad_norm": 3.6914761066436768, + "learning_rate": 1.2293344637558286e-07, + "loss": 0.6599, + "mean_token_accuracy": 0.8008292317390442, + "num_tokens": 11122654.0, + "step": 291 + }, + { + "epoch": 0.037145401348428954, + "ewc_loss": 3.371387720108032e-07, + "grad_norm": 4.700908184051514, + "learning_rate": 1.2335735481136073e-07, + "loss": 0.6145, + "mean_token_accuracy": 0.8071224689483643, + "num_tokens": 11162622.0, + "step": 292 + }, + { + "epoch": 0.03727261162701946, + "ewc_loss": 3.390014171600342e-07, + "grad_norm": 5.767668724060059, + "learning_rate": 1.237812632471386e-07, + "loss": 0.6263, + "mean_token_accuracy": 0.8100391030311584, + "num_tokens": 11202564.0, + "step": 293 + }, + { + "epoch": 0.037399821905609976, + "ewc_loss": 3.390014171600342e-07, + "grad_norm": 4.975704669952393, + "learning_rate": 1.2420517168291648e-07, + "loss": 0.6443, + "mean_token_accuracy": 0.8006953597068787, + "num_tokens": 11243422.0, + "step": 294 + }, + { + "epoch": 0.03752703218420048, + "ewc_loss": 3.4086406230926514e-07, + "grad_norm": 4.865363597869873, + "learning_rate": 1.2462908011869435e-07, + "loss": 0.6261, + "mean_token_accuracy": 0.8066008687019348, + "num_tokens": 11280867.0, + "step": 295 + }, + { + "epoch": 0.03765424246279099, + "ewc_loss": 3.46451997756958e-07, + "grad_norm": 6.814487934112549, + "learning_rate": 1.2505298855447223e-07, + "loss": 0.5755, + "mean_token_accuracy": 0.8232936859130859, + "num_tokens": 11318454.0, + "step": 296 + }, + { + "epoch": 0.037781452741381505, + "ewc_loss": 3.4831464290618896e-07, + "grad_norm": 5.453667640686035, + "learning_rate": 1.254768969902501e-07, + "loss": 0.7154, + "mean_token_accuracy": 0.7849298119544983, + "num_tokens": 11354721.0, + "step": 297 + }, + { + "epoch": 0.03790866301997201, + "ewc_loss": 3.501772880554199e-07, + "grad_norm": 5.350839614868164, + "learning_rate": 1.2590080542602797e-07, + "loss": 0.6726, + "mean_token_accuracy": 0.7963230609893799, + "num_tokens": 11385938.0, + "step": 298 + }, + { + "epoch": 0.03803587329856253, + "ewc_loss": 3.5390257835388184e-07, + "grad_norm": 5.218733310699463, + "learning_rate": 1.2632471386180584e-07, + "loss": 0.6527, + "mean_token_accuracy": 0.8009493350982666, + "num_tokens": 11424911.0, + "step": 299 + }, + { + "epoch": 0.038163083577153034, + "ewc_loss": 3.5390257835388184e-07, + "grad_norm": 4.265783309936523, + "learning_rate": 1.2674862229758372e-07, + "loss": 0.6211, + "mean_token_accuracy": 0.8110189437866211, + "num_tokens": 11460580.0, + "step": 300 + }, + { + "epoch": 0.03829029385574354, + "ewc_loss": 3.557652235031128e-07, + "grad_norm": 6.070849895477295, + "learning_rate": 1.271725307333616e-07, + "loss": 0.6117, + "mean_token_accuracy": 0.8146043419837952, + "num_tokens": 11501031.0, + "step": 301 + }, + { + "epoch": 0.038417504134334056, + "ewc_loss": 3.557652235031128e-07, + "grad_norm": 4.3286943435668945, + "learning_rate": 1.2759643916913946e-07, + "loss": 0.6211, + "mean_token_accuracy": 0.8103044033050537, + "num_tokens": 11544053.0, + "step": 302 + }, + { + "epoch": 0.03854471441292456, + "ewc_loss": 3.557652235031128e-07, + "grad_norm": 5.924989223480225, + "learning_rate": 1.2802034760491733e-07, + "loss": 0.6369, + "mean_token_accuracy": 0.805304765701294, + "num_tokens": 11576944.0, + "step": 303 + }, + { + "epoch": 0.03867192469151508, + "ewc_loss": 3.557652235031128e-07, + "grad_norm": 3.7516777515411377, + "learning_rate": 1.284442560406952e-07, + "loss": 0.6871, + "mean_token_accuracy": 0.7905560731887817, + "num_tokens": 11619735.0, + "step": 304 + }, + { + "epoch": 0.038799134970105585, + "ewc_loss": 3.557652235031128e-07, + "grad_norm": 5.245358467102051, + "learning_rate": 1.2886816447647308e-07, + "loss": 0.7237, + "mean_token_accuracy": 0.7777252197265625, + "num_tokens": 11655025.0, + "step": 305 + }, + { + "epoch": 0.03892634524869609, + "ewc_loss": 3.594905138015747e-07, + "grad_norm": 5.115804195404053, + "learning_rate": 1.2929207291225095e-07, + "loss": 0.6216, + "mean_token_accuracy": 0.8113612532615662, + "num_tokens": 11691970.0, + "step": 306 + }, + { + "epoch": 0.03905355552728661, + "ewc_loss": 3.632158041000366e-07, + "grad_norm": 5.506860733032227, + "learning_rate": 1.2971598134802882e-07, + "loss": 0.6373, + "mean_token_accuracy": 0.8048353791236877, + "num_tokens": 11729561.0, + "step": 307 + }, + { + "epoch": 0.039180765805877114, + "ewc_loss": 3.650784492492676e-07, + "grad_norm": 4.9837117195129395, + "learning_rate": 1.301398897838067e-07, + "loss": 0.656, + "mean_token_accuracy": 0.797944962978363, + "num_tokens": 11765707.0, + "step": 308 + }, + { + "epoch": 0.03930797608446762, + "ewc_loss": 3.650784492492676e-07, + "grad_norm": 4.3857550621032715, + "learning_rate": 1.3056379821958457e-07, + "loss": 0.6647, + "mean_token_accuracy": 0.8025667667388916, + "num_tokens": 11806223.0, + "step": 309 + }, + { + "epoch": 0.039435186363058136, + "ewc_loss": 3.725290298461914e-07, + "grad_norm": 5.353142261505127, + "learning_rate": 1.3098770665536244e-07, + "loss": 0.6203, + "mean_token_accuracy": 0.8120870590209961, + "num_tokens": 11845477.0, + "step": 310 + }, + { + "epoch": 0.039562396641648644, + "ewc_loss": 3.7439167499542236e-07, + "grad_norm": 7.136176109313965, + "learning_rate": 1.3141161509114031e-07, + "loss": 0.6047, + "mean_token_accuracy": 0.8101341724395752, + "num_tokens": 11877973.0, + "step": 311 + }, + { + "epoch": 0.03968960692023916, + "ewc_loss": 3.7439167499542236e-07, + "grad_norm": 4.621185302734375, + "learning_rate": 1.3183552352691819e-07, + "loss": 0.6104, + "mean_token_accuracy": 0.8131834268569946, + "num_tokens": 11912093.0, + "step": 312 + }, + { + "epoch": 0.039816817198829665, + "ewc_loss": 3.7439167499542236e-07, + "grad_norm": 4.499348163604736, + "learning_rate": 1.3225943196269603e-07, + "loss": 0.6366, + "mean_token_accuracy": 0.8096308708190918, + "num_tokens": 11952541.0, + "step": 313 + }, + { + "epoch": 0.03994402747742017, + "ewc_loss": 3.7997961044311523e-07, + "grad_norm": 4.6406097412109375, + "learning_rate": 1.3268334039847393e-07, + "loss": 0.6136, + "mean_token_accuracy": 0.810263454914093, + "num_tokens": 11990414.0, + "step": 314 + }, + { + "epoch": 0.04007123775601069, + "ewc_loss": 3.7997961044311523e-07, + "grad_norm": 6.485106945037842, + "learning_rate": 1.3310724883425178e-07, + "loss": 0.6811, + "mean_token_accuracy": 0.7901315689086914, + "num_tokens": 12025925.0, + "step": 315 + }, + { + "epoch": 0.040198448034601195, + "ewc_loss": 3.8370490074157715e-07, + "grad_norm": 5.153565883636475, + "learning_rate": 1.3353115727002968e-07, + "loss": 0.6488, + "mean_token_accuracy": 0.8015759587287903, + "num_tokens": 12068879.0, + "step": 316 + }, + { + "epoch": 0.04032565831319171, + "ewc_loss": 3.8370490074157715e-07, + "grad_norm": 4.7693634033203125, + "learning_rate": 1.3395506570580752e-07, + "loss": 0.6152, + "mean_token_accuracy": 0.8109124898910522, + "num_tokens": 12111931.0, + "step": 317 + }, + { + "epoch": 0.040452868591782216, + "ewc_loss": 3.855675458908081e-07, + "grad_norm": 5.229496479034424, + "learning_rate": 1.3437897414158542e-07, + "loss": 0.682, + "mean_token_accuracy": 0.7901155948638916, + "num_tokens": 12140431.0, + "step": 318 + }, + { + "epoch": 0.040580078870372724, + "ewc_loss": 3.8929283618927e-07, + "grad_norm": 4.308361530303955, + "learning_rate": 1.3480288257736327e-07, + "loss": 0.6124, + "mean_token_accuracy": 0.8111217617988586, + "num_tokens": 12173932.0, + "step": 319 + }, + { + "epoch": 0.04070728914896324, + "ewc_loss": 3.91155481338501e-07, + "grad_norm": 4.3077239990234375, + "learning_rate": 1.3522679101314117e-07, + "loss": 0.5896, + "mean_token_accuracy": 0.8163239359855652, + "num_tokens": 12210040.0, + "step": 320 + }, + { + "epoch": 0.040834499427553746, + "ewc_loss": 3.9301812648773193e-07, + "grad_norm": 5.344631671905518, + "learning_rate": 1.35650699448919e-07, + "loss": 0.6233, + "mean_token_accuracy": 0.809738278388977, + "num_tokens": 12248556.0, + "step": 321 + }, + { + "epoch": 0.04096170970614425, + "ewc_loss": 3.9674341678619385e-07, + "grad_norm": 4.709206581115723, + "learning_rate": 1.360746078846969e-07, + "loss": 0.6122, + "mean_token_accuracy": 0.8106123208999634, + "num_tokens": 12285664.0, + "step": 322 + }, + { + "epoch": 0.04108891998473477, + "ewc_loss": 3.948807716369629e-07, + "grad_norm": 4.97543478012085, + "learning_rate": 1.3649851632047476e-07, + "loss": 0.6653, + "mean_token_accuracy": 0.7967453002929688, + "num_tokens": 12326964.0, + "step": 323 + }, + { + "epoch": 0.041216130263325275, + "ewc_loss": 3.986060619354248e-07, + "grad_norm": 3.7609426975250244, + "learning_rate": 1.3692242475625266e-07, + "loss": 0.5742, + "mean_token_accuracy": 0.8255643844604492, + "num_tokens": 12366541.0, + "step": 324 + }, + { + "epoch": 0.04134334054191579, + "ewc_loss": 3.986060619354248e-07, + "grad_norm": 5.023813247680664, + "learning_rate": 1.373463331920305e-07, + "loss": 0.6367, + "mean_token_accuracy": 0.8094503879547119, + "num_tokens": 12405664.0, + "step": 325 + }, + { + "epoch": 0.0414705508205063, + "ewc_loss": 4.0046870708465576e-07, + "grad_norm": 5.865102291107178, + "learning_rate": 1.377702416278084e-07, + "loss": 0.7002, + "mean_token_accuracy": 0.78751540184021, + "num_tokens": 12445039.0, + "step": 326 + }, + { + "epoch": 0.041597761099096804, + "ewc_loss": 4.041939973831177e-07, + "grad_norm": 4.5191473960876465, + "learning_rate": 1.3819415006358625e-07, + "loss": 0.5703, + "mean_token_accuracy": 0.8243749141693115, + "num_tokens": 12485481.0, + "step": 327 + }, + { + "epoch": 0.04172497137768732, + "ewc_loss": 4.041939973831177e-07, + "grad_norm": 4.195931434631348, + "learning_rate": 1.3861805849936415e-07, + "loss": 0.5949, + "mean_token_accuracy": 0.8198784589767456, + "num_tokens": 12530272.0, + "step": 328 + }, + { + "epoch": 0.041852181656277826, + "ewc_loss": 4.0605664253234863e-07, + "grad_norm": 4.897982120513916, + "learning_rate": 1.39041966935142e-07, + "loss": 0.6402, + "mean_token_accuracy": 0.8029866814613342, + "num_tokens": 12565854.0, + "step": 329 + }, + { + "epoch": 0.04197939193486834, + "ewc_loss": 4.0605664253234863e-07, + "grad_norm": 3.8281409740448, + "learning_rate": 1.394658753709199e-07, + "loss": 0.6265, + "mean_token_accuracy": 0.8092924356460571, + "num_tokens": 12607313.0, + "step": 330 + }, + { + "epoch": 0.04210660221345885, + "ewc_loss": 4.0605664253234863e-07, + "grad_norm": 4.530788898468018, + "learning_rate": 1.3988978380669774e-07, + "loss": 0.6641, + "mean_token_accuracy": 0.7977468967437744, + "num_tokens": 12642057.0, + "step": 331 + }, + { + "epoch": 0.042233812492049355, + "ewc_loss": 4.0605664253234863e-07, + "grad_norm": 3.6626508235931396, + "learning_rate": 1.403136922424756e-07, + "loss": 0.6556, + "mean_token_accuracy": 0.7993099689483643, + "num_tokens": 12684118.0, + "step": 332 + }, + { + "epoch": 0.04236102277063987, + "ewc_loss": 4.0978193283081055e-07, + "grad_norm": 4.755073070526123, + "learning_rate": 1.4073760067825348e-07, + "loss": 0.6201, + "mean_token_accuracy": 0.8089624643325806, + "num_tokens": 12721945.0, + "step": 333 + }, + { + "epoch": 0.04248823304923038, + "ewc_loss": 4.1909515857696533e-07, + "grad_norm": 4.903878688812256, + "learning_rate": 1.4116150911403136e-07, + "loss": 0.635, + "mean_token_accuracy": 0.8048193454742432, + "num_tokens": 12764175.0, + "step": 334 + }, + { + "epoch": 0.04261544332782089, + "ewc_loss": 4.209578037261963e-07, + "grad_norm": 4.669766902923584, + "learning_rate": 1.4158541754980923e-07, + "loss": 0.5848, + "mean_token_accuracy": 0.821810781955719, + "num_tokens": 12801600.0, + "step": 335 + }, + { + "epoch": 0.0427426536064114, + "ewc_loss": 4.209578037261963e-07, + "grad_norm": 4.77014684677124, + "learning_rate": 1.420093259855871e-07, + "loss": 0.6168, + "mean_token_accuracy": 0.8161858320236206, + "num_tokens": 12843584.0, + "step": 336 + }, + { + "epoch": 0.042869863885001906, + "ewc_loss": 4.2654573917388916e-07, + "grad_norm": 5.6850972175598145, + "learning_rate": 1.4243323442136497e-07, + "loss": 0.6312, + "mean_token_accuracy": 0.8070085048675537, + "num_tokens": 12878105.0, + "step": 337 + }, + { + "epoch": 0.04299707416359242, + "ewc_loss": 4.2654573917388916e-07, + "grad_norm": 4.201358795166016, + "learning_rate": 1.4285714285714285e-07, + "loss": 0.6378, + "mean_token_accuracy": 0.8058640956878662, + "num_tokens": 12911712.0, + "step": 338 + }, + { + "epoch": 0.04312428444218293, + "ewc_loss": 4.2654573917388916e-07, + "grad_norm": 3.989107847213745, + "learning_rate": 1.4328105129292072e-07, + "loss": 0.6192, + "mean_token_accuracy": 0.8117160797119141, + "num_tokens": 12951303.0, + "step": 339 + }, + { + "epoch": 0.043251494720773435, + "ewc_loss": 4.3213367462158203e-07, + "grad_norm": 5.2876386642456055, + "learning_rate": 1.437049597286986e-07, + "loss": 0.6064, + "mean_token_accuracy": 0.8143645524978638, + "num_tokens": 12990544.0, + "step": 340 + }, + { + "epoch": 0.04337870499936395, + "ewc_loss": 4.33996319770813e-07, + "grad_norm": 5.211333751678467, + "learning_rate": 1.4412886816447646e-07, + "loss": 0.664, + "mean_token_accuracy": 0.7978432178497314, + "num_tokens": 13023766.0, + "step": 341 + }, + { + "epoch": 0.04350591527795446, + "ewc_loss": 4.33996319770813e-07, + "grad_norm": 4.0781402587890625, + "learning_rate": 1.4455277660025434e-07, + "loss": 0.5788, + "mean_token_accuracy": 0.8220751285552979, + "num_tokens": 13056337.0, + "step": 342 + }, + { + "epoch": 0.04363312555654497, + "ewc_loss": 4.33996319770813e-07, + "grad_norm": 6.279749870300293, + "learning_rate": 1.449766850360322e-07, + "loss": 0.6379, + "mean_token_accuracy": 0.8055137395858765, + "num_tokens": 13097345.0, + "step": 343 + }, + { + "epoch": 0.04376033583513548, + "ewc_loss": 4.3585896492004395e-07, + "grad_norm": 6.774698257446289, + "learning_rate": 1.4540059347181008e-07, + "loss": 0.6219, + "mean_token_accuracy": 0.8085880279541016, + "num_tokens": 13127332.0, + "step": 344 + }, + { + "epoch": 0.043887546113725986, + "ewc_loss": 4.3585896492004395e-07, + "grad_norm": 3.7321348190307617, + "learning_rate": 1.4582450190758795e-07, + "loss": 0.6514, + "mean_token_accuracy": 0.7994736433029175, + "num_tokens": 13170760.0, + "step": 345 + }, + { + "epoch": 0.0440147563923165, + "ewc_loss": 4.3958425521850586e-07, + "grad_norm": 5.296533584594727, + "learning_rate": 1.4624841034336583e-07, + "loss": 0.6194, + "mean_token_accuracy": 0.8127514719963074, + "num_tokens": 13207061.0, + "step": 346 + }, + { + "epoch": 0.04414196667090701, + "ewc_loss": 4.414469003677368e-07, + "grad_norm": 4.037265777587891, + "learning_rate": 1.466723187791437e-07, + "loss": 0.5891, + "mean_token_accuracy": 0.8192697763442993, + "num_tokens": 13247903.0, + "step": 347 + }, + { + "epoch": 0.04426917694949752, + "ewc_loss": 4.4517219066619873e-07, + "grad_norm": 3.834005117416382, + "learning_rate": 1.4709622721492157e-07, + "loss": 0.5996, + "mean_token_accuracy": 0.8120604157447815, + "num_tokens": 13287193.0, + "step": 348 + }, + { + "epoch": 0.04439638722808803, + "ewc_loss": 4.4517219066619873e-07, + "grad_norm": 5.036059379577637, + "learning_rate": 1.4752013565069942e-07, + "loss": 0.6037, + "mean_token_accuracy": 0.8154102563858032, + "num_tokens": 13323112.0, + "step": 349 + }, + { + "epoch": 0.04452359750667854, + "ewc_loss": 4.507601261138916e-07, + "grad_norm": 4.799718856811523, + "learning_rate": 1.4794404408647732e-07, + "loss": 0.5996, + "mean_token_accuracy": 0.817146897315979, + "num_tokens": 13359653.0, + "step": 350 + }, + { + "epoch": 0.04465080778526905, + "ewc_loss": 4.544854164123535e-07, + "grad_norm": 3.8121910095214844, + "learning_rate": 1.4836795252225516e-07, + "loss": 0.5694, + "mean_token_accuracy": 0.8220512270927429, + "num_tokens": 13400571.0, + "step": 351 + }, + { + "epoch": 0.04477801806385956, + "ewc_loss": 4.5262277126312256e-07, + "grad_norm": 4.481450080871582, + "learning_rate": 1.4879186095803306e-07, + "loss": 0.5477, + "mean_token_accuracy": 0.8264718055725098, + "num_tokens": 13437656.0, + "step": 352 + }, + { + "epoch": 0.04490522834245007, + "ewc_loss": 4.5634806156158447e-07, + "grad_norm": 4.367088794708252, + "learning_rate": 1.492157693938109e-07, + "loss": 0.6058, + "mean_token_accuracy": 0.8135788440704346, + "num_tokens": 13474688.0, + "step": 353 + }, + { + "epoch": 0.04503243862104058, + "ewc_loss": 4.600733518600464e-07, + "grad_norm": 3.824073553085327, + "learning_rate": 1.496396778295888e-07, + "loss": 0.5447, + "mean_token_accuracy": 0.830457329750061, + "num_tokens": 13518256.0, + "step": 354 + }, + { + "epoch": 0.04515964889963109, + "ewc_loss": 4.6193599700927734e-07, + "grad_norm": 4.501540660858154, + "learning_rate": 1.5006358626536665e-07, + "loss": 0.6311, + "mean_token_accuracy": 0.8082520961761475, + "num_tokens": 13554315.0, + "step": 355 + }, + { + "epoch": 0.0452868591782216, + "ewc_loss": 4.6193599700927734e-07, + "grad_norm": 3.549834728240967, + "learning_rate": 1.5048749470114455e-07, + "loss": 0.6518, + "mean_token_accuracy": 0.8018648624420166, + "num_tokens": 13594388.0, + "step": 356 + }, + { + "epoch": 0.04541406945681211, + "ewc_loss": 4.637986421585083e-07, + "grad_norm": 4.267528533935547, + "learning_rate": 1.509114031369224e-07, + "loss": 0.5786, + "mean_token_accuracy": 0.8182199001312256, + "num_tokens": 13633704.0, + "step": 357 + }, + { + "epoch": 0.04554127973540262, + "ewc_loss": 4.675239324569702e-07, + "grad_norm": 4.0852370262146, + "learning_rate": 1.513353115727003e-07, + "loss": 0.6276, + "mean_token_accuracy": 0.8034470081329346, + "num_tokens": 13671183.0, + "step": 358 + }, + { + "epoch": 0.04566849001399313, + "ewc_loss": 4.7124922275543213e-07, + "grad_norm": 4.5157470703125, + "learning_rate": 1.5175922000847814e-07, + "loss": 0.5829, + "mean_token_accuracy": 0.8217211961746216, + "num_tokens": 13709640.0, + "step": 359 + }, + { + "epoch": 0.04579570029258364, + "ewc_loss": 4.7124922275543213e-07, + "grad_norm": 4.530844688415527, + "learning_rate": 1.5218312844425604e-07, + "loss": 0.606, + "mean_token_accuracy": 0.8073323369026184, + "num_tokens": 13751114.0, + "step": 360 + }, + { + "epoch": 0.045922910571174154, + "ewc_loss": 4.7124922275543213e-07, + "grad_norm": 4.1783928871154785, + "learning_rate": 1.526070368800339e-07, + "loss": 0.5355, + "mean_token_accuracy": 0.8290407657623291, + "num_tokens": 13786448.0, + "step": 361 + }, + { + "epoch": 0.04605012084976466, + "ewc_loss": 4.7497451305389404e-07, + "grad_norm": 3.743211269378662, + "learning_rate": 1.530309453158118e-07, + "loss": 0.5592, + "mean_token_accuracy": 0.8307135105133057, + "num_tokens": 13827663.0, + "step": 362 + }, + { + "epoch": 0.04617733112835517, + "ewc_loss": 4.7497451305389404e-07, + "grad_norm": 4.953640937805176, + "learning_rate": 1.5345485375158963e-07, + "loss": 0.6385, + "mean_token_accuracy": 0.8054243922233582, + "num_tokens": 13866078.0, + "step": 363 + }, + { + "epoch": 0.04630454140694568, + "ewc_loss": 4.7497451305389404e-07, + "grad_norm": 3.4980275630950928, + "learning_rate": 1.5387876218736753e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.8383395075798035, + "num_tokens": 13909767.0, + "step": 364 + }, + { + "epoch": 0.04643175168553619, + "ewc_loss": 4.76837158203125e-07, + "grad_norm": 4.267710208892822, + "learning_rate": 1.5430267062314538e-07, + "loss": 0.6235, + "mean_token_accuracy": 0.8102254867553711, + "num_tokens": 13948937.0, + "step": 365 + }, + { + "epoch": 0.0465589619641267, + "ewc_loss": 4.76837158203125e-07, + "grad_norm": 4.639916896820068, + "learning_rate": 1.5472657905892328e-07, + "loss": 0.6342, + "mean_token_accuracy": 0.8050104379653931, + "num_tokens": 13984039.0, + "step": 366 + }, + { + "epoch": 0.04668617224271721, + "ewc_loss": 4.76837158203125e-07, + "grad_norm": 5.675799369812012, + "learning_rate": 1.5515048749470113e-07, + "loss": 0.6166, + "mean_token_accuracy": 0.809571385383606, + "num_tokens": 14018162.0, + "step": 367 + }, + { + "epoch": 0.04681338252130772, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 4.066525459289551, + "learning_rate": 1.55574395930479e-07, + "loss": 0.6162, + "mean_token_accuracy": 0.8100687265396118, + "num_tokens": 14056493.0, + "step": 368 + }, + { + "epoch": 0.046940592799898234, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 4.8712944984436035, + "learning_rate": 1.5599830436625687e-07, + "loss": 0.6093, + "mean_token_accuracy": 0.813296914100647, + "num_tokens": 14097530.0, + "step": 369 + }, + { + "epoch": 0.04706780307848874, + "ewc_loss": 4.917383193969727e-07, + "grad_norm": 4.73914098739624, + "learning_rate": 1.5642221280203474e-07, + "loss": 0.6279, + "mean_token_accuracy": 0.8101433515548706, + "num_tokens": 14136240.0, + "step": 370 + }, + { + "epoch": 0.04719501335707925, + "ewc_loss": 4.917383193969727e-07, + "grad_norm": 5.847086429595947, + "learning_rate": 1.5684612123781262e-07, + "loss": 0.5401, + "mean_token_accuracy": 0.8311153650283813, + "num_tokens": 14171010.0, + "step": 371 + }, + { + "epoch": 0.04732222363566976, + "ewc_loss": 4.954636096954346e-07, + "grad_norm": 3.7140958309173584, + "learning_rate": 1.572700296735905e-07, + "loss": 0.6107, + "mean_token_accuracy": 0.8178529739379883, + "num_tokens": 14206985.0, + "step": 372 + }, + { + "epoch": 0.04744943391426027, + "ewc_loss": 4.954636096954346e-07, + "grad_norm": 4.722815990447998, + "learning_rate": 1.576939381093684e-07, + "loss": 0.6082, + "mean_token_accuracy": 0.8091747760772705, + "num_tokens": 14236728.0, + "step": 373 + }, + { + "epoch": 0.047576644192850785, + "ewc_loss": 4.954636096954346e-07, + "grad_norm": 4.032540321350098, + "learning_rate": 1.5811784654514623e-07, + "loss": 0.5358, + "mean_token_accuracy": 0.8344739675521851, + "num_tokens": 14270758.0, + "step": 374 + }, + { + "epoch": 0.04770385447144129, + "ewc_loss": 4.991888999938965e-07, + "grad_norm": 3.684112787246704, + "learning_rate": 1.5854175498092413e-07, + "loss": 0.6529, + "mean_token_accuracy": 0.7969425916671753, + "num_tokens": 14315002.0, + "step": 375 + }, + { + "epoch": 0.0478310647500318, + "ewc_loss": 5.029141902923584e-07, + "grad_norm": 4.017214298248291, + "learning_rate": 1.5896566341670198e-07, + "loss": 0.6228, + "mean_token_accuracy": 0.805560827255249, + "num_tokens": 14353385.0, + "step": 376 + }, + { + "epoch": 0.047958275028622314, + "ewc_loss": 5.029141902923584e-07, + "grad_norm": 4.816952705383301, + "learning_rate": 1.5938957185247988e-07, + "loss": 0.6561, + "mean_token_accuracy": 0.7944432497024536, + "num_tokens": 14391702.0, + "step": 377 + }, + { + "epoch": 0.04808548530721282, + "ewc_loss": 5.029141902923584e-07, + "grad_norm": 4.350587844848633, + "learning_rate": 1.5981348028825772e-07, + "loss": 0.6236, + "mean_token_accuracy": 0.8072555065155029, + "num_tokens": 14427876.0, + "step": 378 + }, + { + "epoch": 0.048212695585803336, + "ewc_loss": 5.029141902923584e-07, + "grad_norm": 5.194976329803467, + "learning_rate": 1.6023738872403562e-07, + "loss": 0.7096, + "mean_token_accuracy": 0.7810785174369812, + "num_tokens": 14460349.0, + "step": 379 + }, + { + "epoch": 0.048339905864393844, + "ewc_loss": 5.029141902923584e-07, + "grad_norm": 3.9218876361846924, + "learning_rate": 1.6066129715981347e-07, + "loss": 0.5601, + "mean_token_accuracy": 0.8280526995658875, + "num_tokens": 14497763.0, + "step": 380 + }, + { + "epoch": 0.04846711614298435, + "ewc_loss": 5.029141902923584e-07, + "grad_norm": 3.0234298706054688, + "learning_rate": 1.6108520559559137e-07, + "loss": 0.5538, + "mean_token_accuracy": 0.8261250257492065, + "num_tokens": 14537172.0, + "step": 381 + }, + { + "epoch": 0.048594326421574865, + "ewc_loss": 5.029141902923584e-07, + "grad_norm": 4.0370049476623535, + "learning_rate": 1.6150911403136921e-07, + "loss": 0.6398, + "mean_token_accuracy": 0.8078353404998779, + "num_tokens": 14565515.0, + "step": 382 + }, + { + "epoch": 0.04872153670016537, + "ewc_loss": 5.029141902923584e-07, + "grad_norm": 3.2036616802215576, + "learning_rate": 1.619330224671471e-07, + "loss": 0.6314, + "mean_token_accuracy": 0.8073036074638367, + "num_tokens": 14608104.0, + "step": 383 + }, + { + "epoch": 0.04884874697875588, + "ewc_loss": 5.029141902923584e-07, + "grad_norm": 4.704512596130371, + "learning_rate": 1.6235693090292496e-07, + "loss": 0.6404, + "mean_token_accuracy": 0.807220458984375, + "num_tokens": 14645328.0, + "step": 384 + }, + { + "epoch": 0.048975957257346395, + "ewc_loss": 5.066394805908203e-07, + "grad_norm": 3.896965980529785, + "learning_rate": 1.6278083933870286e-07, + "loss": 0.6722, + "mean_token_accuracy": 0.7906695604324341, + "num_tokens": 14678792.0, + "step": 385 + }, + { + "epoch": 0.0491031675359369, + "ewc_loss": 5.066394805908203e-07, + "grad_norm": 3.59096622467041, + "learning_rate": 1.632047477744807e-07, + "loss": 0.5957, + "mean_token_accuracy": 0.8170809149742126, + "num_tokens": 14715095.0, + "step": 386 + }, + { + "epoch": 0.049230377814527417, + "ewc_loss": 5.066394805908203e-07, + "grad_norm": 3.749987840652466, + "learning_rate": 1.6362865621025858e-07, + "loss": 0.6155, + "mean_token_accuracy": 0.810562252998352, + "num_tokens": 14753641.0, + "step": 387 + }, + { + "epoch": 0.049357588093117924, + "ewc_loss": 5.066394805908203e-07, + "grad_norm": 3.8780555725097656, + "learning_rate": 1.6405256464603645e-07, + "loss": 0.5866, + "mean_token_accuracy": 0.8174591064453125, + "num_tokens": 14788816.0, + "step": 388 + }, + { + "epoch": 0.04948479837170843, + "ewc_loss": 5.103647708892822e-07, + "grad_norm": 4.3930745124816895, + "learning_rate": 1.6447647308181432e-07, + "loss": 0.6305, + "mean_token_accuracy": 0.8050038814544678, + "num_tokens": 14821607.0, + "step": 389 + }, + { + "epoch": 0.049612008650298946, + "ewc_loss": 5.066394805908203e-07, + "grad_norm": 3.1949470043182373, + "learning_rate": 1.649003815175922e-07, + "loss": 0.6481, + "mean_token_accuracy": 0.803152322769165, + "num_tokens": 14861298.0, + "step": 390 + }, + { + "epoch": 0.04973921892888945, + "ewc_loss": 5.103647708892822e-07, + "grad_norm": 3.489229440689087, + "learning_rate": 1.6532428995337007e-07, + "loss": 0.5649, + "mean_token_accuracy": 0.8238948583602905, + "num_tokens": 14899951.0, + "step": 391 + }, + { + "epoch": 0.04986642920747997, + "ewc_loss": 5.140900611877441e-07, + "grad_norm": 3.948596715927124, + "learning_rate": 1.6574819838914794e-07, + "loss": 0.5462, + "mean_token_accuracy": 0.8308666944503784, + "num_tokens": 14937169.0, + "step": 392 + }, + { + "epoch": 0.049993639486070475, + "ewc_loss": 5.178153514862061e-07, + "grad_norm": 3.732072353363037, + "learning_rate": 1.661721068249258e-07, + "loss": 0.5981, + "mean_token_accuracy": 0.8149612545967102, + "num_tokens": 14975549.0, + "step": 393 + }, + { + "epoch": 0.05012084976466098, + "ewc_loss": 5.178153514862061e-07, + "grad_norm": 3.5528714656829834, + "learning_rate": 1.6659601526070368e-07, + "loss": 0.6257, + "mean_token_accuracy": 0.8080010414123535, + "num_tokens": 15016630.0, + "step": 394 + }, + { + "epoch": 0.0502480600432515, + "ewc_loss": 5.178153514862061e-07, + "grad_norm": 3.6253228187561035, + "learning_rate": 1.6701992369648156e-07, + "loss": 0.6077, + "mean_token_accuracy": 0.8111173510551453, + "num_tokens": 15057822.0, + "step": 395 + }, + { + "epoch": 0.050375270321842004, + "ewc_loss": 5.21540641784668e-07, + "grad_norm": 4.704326152801514, + "learning_rate": 1.6744383213225943e-07, + "loss": 0.5803, + "mean_token_accuracy": 0.8161640763282776, + "num_tokens": 15094385.0, + "step": 396 + }, + { + "epoch": 0.05050248060043251, + "ewc_loss": 5.21540641784668e-07, + "grad_norm": 4.567748069763184, + "learning_rate": 1.678677405680373e-07, + "loss": 0.54, + "mean_token_accuracy": 0.8279776573181152, + "num_tokens": 15130969.0, + "step": 397 + }, + { + "epoch": 0.050629690879023026, + "ewc_loss": 5.289912223815918e-07, + "grad_norm": 4.573604106903076, + "learning_rate": 1.6829164900381518e-07, + "loss": 0.5616, + "mean_token_accuracy": 0.8217571973800659, + "num_tokens": 15166586.0, + "step": 398 + }, + { + "epoch": 0.05075690115761353, + "ewc_loss": 5.252659320831299e-07, + "grad_norm": 3.5580153465270996, + "learning_rate": 1.6871555743959305e-07, + "loss": 0.5514, + "mean_token_accuracy": 0.8247906565666199, + "num_tokens": 15209603.0, + "step": 399 + }, + { + "epoch": 0.05088411143620405, + "ewc_loss": 5.252659320831299e-07, + "grad_norm": 4.099836826324463, + "learning_rate": 1.6913946587537092e-07, + "loss": 0.5689, + "mean_token_accuracy": 0.8207578659057617, + "num_tokens": 15251542.0, + "step": 400 + }, + { + "epoch": 0.051011321714794555, + "ewc_loss": 5.289912223815918e-07, + "grad_norm": 4.403663635253906, + "learning_rate": 1.695633743111488e-07, + "loss": 0.5829, + "mean_token_accuracy": 0.8204394578933716, + "num_tokens": 15288940.0, + "step": 401 + }, + { + "epoch": 0.05113853199338506, + "ewc_loss": 5.327165126800537e-07, + "grad_norm": 3.364311933517456, + "learning_rate": 1.6998728274692667e-07, + "loss": 0.55, + "mean_token_accuracy": 0.8322638273239136, + "num_tokens": 15321965.0, + "step": 402 + }, + { + "epoch": 0.05126574227197558, + "ewc_loss": 5.327165126800537e-07, + "grad_norm": 3.495732069015503, + "learning_rate": 1.7041119118270454e-07, + "loss": 0.559, + "mean_token_accuracy": 0.8275420665740967, + "num_tokens": 15362895.0, + "step": 403 + }, + { + "epoch": 0.051392952550566084, + "ewc_loss": 5.327165126800537e-07, + "grad_norm": 5.366794586181641, + "learning_rate": 1.7083509961848238e-07, + "loss": 0.6159, + "mean_token_accuracy": 0.8083093166351318, + "num_tokens": 15396675.0, + "step": 404 + }, + { + "epoch": 0.0515201628291566, + "ewc_loss": 5.364418029785156e-07, + "grad_norm": 4.732341766357422, + "learning_rate": 1.7125900805426028e-07, + "loss": 0.6378, + "mean_token_accuracy": 0.8036385178565979, + "num_tokens": 15443070.0, + "step": 405 + }, + { + "epoch": 0.051647373107747106, + "ewc_loss": 5.364418029785156e-07, + "grad_norm": 3.6799991130828857, + "learning_rate": 1.7168291649003813e-07, + "loss": 0.5644, + "mean_token_accuracy": 0.8237192630767822, + "num_tokens": 15476747.0, + "step": 406 + }, + { + "epoch": 0.051774583386337614, + "ewc_loss": 5.401670932769775e-07, + "grad_norm": 4.065703392028809, + "learning_rate": 1.7210682492581603e-07, + "loss": 0.5902, + "mean_token_accuracy": 0.8186886310577393, + "num_tokens": 15514866.0, + "step": 407 + }, + { + "epoch": 0.05190179366492813, + "ewc_loss": 5.401670932769775e-07, + "grad_norm": 3.685351848602295, + "learning_rate": 1.7253073336159387e-07, + "loss": 0.5673, + "mean_token_accuracy": 0.8213762044906616, + "num_tokens": 15554245.0, + "step": 408 + }, + { + "epoch": 0.052029003943518635, + "ewc_loss": 5.401670932769775e-07, + "grad_norm": 3.473944664001465, + "learning_rate": 1.7295464179737177e-07, + "loss": 0.5964, + "mean_token_accuracy": 0.8165558576583862, + "num_tokens": 15593958.0, + "step": 409 + }, + { + "epoch": 0.05215621422210915, + "ewc_loss": 5.438923835754395e-07, + "grad_norm": 3.72444748878479, + "learning_rate": 1.7337855023314962e-07, + "loss": 0.5414, + "mean_token_accuracy": 0.8305819034576416, + "num_tokens": 15632567.0, + "step": 410 + }, + { + "epoch": 0.05228342450069966, + "ewc_loss": 5.438923835754395e-07, + "grad_norm": 4.693933486938477, + "learning_rate": 1.7380245866892752e-07, + "loss": 0.5506, + "mean_token_accuracy": 0.8267899751663208, + "num_tokens": 15668255.0, + "step": 411 + }, + { + "epoch": 0.052410634779290165, + "ewc_loss": 5.513429641723633e-07, + "grad_norm": 3.7435028553009033, + "learning_rate": 1.7422636710470536e-07, + "loss": 0.603, + "mean_token_accuracy": 0.8121923208236694, + "num_tokens": 15705435.0, + "step": 412 + }, + { + "epoch": 0.05253784505788068, + "ewc_loss": 5.476176738739014e-07, + "grad_norm": 3.3608803749084473, + "learning_rate": 1.7465027554048326e-07, + "loss": 0.623, + "mean_token_accuracy": 0.8064613342285156, + "num_tokens": 15744657.0, + "step": 413 + }, + { + "epoch": 0.05266505533647119, + "ewc_loss": 5.587935447692871e-07, + "grad_norm": 3.9951436519622803, + "learning_rate": 1.750741839762611e-07, + "loss": 0.5676, + "mean_token_accuracy": 0.8256915807723999, + "num_tokens": 15777374.0, + "step": 414 + }, + { + "epoch": 0.052792265615061694, + "ewc_loss": 5.587935447692871e-07, + "grad_norm": 3.0920112133026123, + "learning_rate": 1.75498092412039e-07, + "loss": 0.5387, + "mean_token_accuracy": 0.8317774534225464, + "num_tokens": 15814047.0, + "step": 415 + }, + { + "epoch": 0.05291947589365221, + "ewc_loss": 5.587935447692871e-07, + "grad_norm": 3.5327165126800537, + "learning_rate": 1.7592200084781686e-07, + "loss": 0.6199, + "mean_token_accuracy": 0.8073428273200989, + "num_tokens": 15849110.0, + "step": 416 + }, + { + "epoch": 0.053046686172242716, + "ewc_loss": 5.587935447692871e-07, + "grad_norm": 3.191091299057007, + "learning_rate": 1.7634590928359475e-07, + "loss": 0.5904, + "mean_token_accuracy": 0.8190550804138184, + "num_tokens": 15888910.0, + "step": 417 + }, + { + "epoch": 0.05317389645083323, + "ewc_loss": 5.587935447692871e-07, + "grad_norm": 3.943847894668579, + "learning_rate": 1.767698177193726e-07, + "loss": 0.5769, + "mean_token_accuracy": 0.8202989101409912, + "num_tokens": 15925604.0, + "step": 418 + }, + { + "epoch": 0.05330110672942374, + "ewc_loss": 5.62518835067749e-07, + "grad_norm": 3.494607925415039, + "learning_rate": 1.771937261551505e-07, + "loss": 0.5511, + "mean_token_accuracy": 0.8335659503936768, + "num_tokens": 15961401.0, + "step": 419 + }, + { + "epoch": 0.053428317008014245, + "ewc_loss": 5.662441253662109e-07, + "grad_norm": 3.750483512878418, + "learning_rate": 1.7761763459092835e-07, + "loss": 0.582, + "mean_token_accuracy": 0.8200175166130066, + "num_tokens": 16001029.0, + "step": 420 + }, + { + "epoch": 0.05355552728660476, + "ewc_loss": 5.699694156646729e-07, + "grad_norm": 3.5619943141937256, + "learning_rate": 1.7804154302670624e-07, + "loss": 0.5628, + "mean_token_accuracy": 0.8245970606803894, + "num_tokens": 16036014.0, + "step": 421 + }, + { + "epoch": 0.05368273756519527, + "ewc_loss": 5.699694156646729e-07, + "grad_norm": 3.164228677749634, + "learning_rate": 1.784654514624841e-07, + "loss": 0.5443, + "mean_token_accuracy": 0.8272371888160706, + "num_tokens": 16075373.0, + "step": 422 + }, + { + "epoch": 0.05380994784378578, + "ewc_loss": 5.774199962615967e-07, + "grad_norm": 3.3792622089385986, + "learning_rate": 1.7888935989826196e-07, + "loss": 0.5872, + "mean_token_accuracy": 0.8146610260009766, + "num_tokens": 16111352.0, + "step": 423 + }, + { + "epoch": 0.05393715812237629, + "ewc_loss": 5.774199962615967e-07, + "grad_norm": 3.328418493270874, + "learning_rate": 1.7931326833403984e-07, + "loss": 0.6027, + "mean_token_accuracy": 0.8112437725067139, + "num_tokens": 16148826.0, + "step": 424 + }, + { + "epoch": 0.054064368400966796, + "ewc_loss": 5.774199962615967e-07, + "grad_norm": 3.0587823390960693, + "learning_rate": 1.797371767698177e-07, + "loss": 0.5117, + "mean_token_accuracy": 0.8408454656600952, + "num_tokens": 16193491.0, + "step": 425 + }, + { + "epoch": 0.05419157867955731, + "ewc_loss": 5.774199962615967e-07, + "grad_norm": 3.5568318367004395, + "learning_rate": 1.8016108520559558e-07, + "loss": 0.5597, + "mean_token_accuracy": 0.8258209228515625, + "num_tokens": 16235595.0, + "step": 426 + }, + { + "epoch": 0.05431878895814782, + "ewc_loss": 5.811452865600586e-07, + "grad_norm": 3.6981935501098633, + "learning_rate": 1.8058499364137345e-07, + "loss": 0.6133, + "mean_token_accuracy": 0.8148624897003174, + "num_tokens": 16274105.0, + "step": 427 + }, + { + "epoch": 0.054445999236738325, + "ewc_loss": 5.811452865600586e-07, + "grad_norm": 3.3859059810638428, + "learning_rate": 1.8100890207715133e-07, + "loss": 0.6568, + "mean_token_accuracy": 0.7938722372055054, + "num_tokens": 16315886.0, + "step": 428 + }, + { + "epoch": 0.05457320951532884, + "ewc_loss": 5.848705768585205e-07, + "grad_norm": 3.3136417865753174, + "learning_rate": 1.814328105129292e-07, + "loss": 0.598, + "mean_token_accuracy": 0.8128254413604736, + "num_tokens": 16353021.0, + "step": 429 + }, + { + "epoch": 0.05470041979391935, + "ewc_loss": 5.885958671569824e-07, + "grad_norm": 4.369722843170166, + "learning_rate": 1.8185671894870707e-07, + "loss": 0.5734, + "mean_token_accuracy": 0.8183128237724304, + "num_tokens": 16380887.0, + "step": 430 + }, + { + "epoch": 0.05482763007250986, + "ewc_loss": 5.885958671569824e-07, + "grad_norm": 2.921778440475464, + "learning_rate": 1.8228062738448494e-07, + "loss": 0.6288, + "mean_token_accuracy": 0.8066331148147583, + "num_tokens": 16421066.0, + "step": 431 + }, + { + "epoch": 0.05495484035110037, + "ewc_loss": 5.923211574554443e-07, + "grad_norm": 3.5038094520568848, + "learning_rate": 1.8270453582026282e-07, + "loss": 0.597, + "mean_token_accuracy": 0.813459038734436, + "num_tokens": 16457197.0, + "step": 432 + }, + { + "epoch": 0.055082050629690876, + "ewc_loss": 5.923211574554443e-07, + "grad_norm": 2.7281014919281006, + "learning_rate": 1.831284442560407e-07, + "loss": 0.5527, + "mean_token_accuracy": 0.82685387134552, + "num_tokens": 16497385.0, + "step": 433 + }, + { + "epoch": 0.05520926090828139, + "ewc_loss": 5.960464477539062e-07, + "grad_norm": 3.298003673553467, + "learning_rate": 1.8355235269181856e-07, + "loss": 0.5681, + "mean_token_accuracy": 0.8235843777656555, + "num_tokens": 16540099.0, + "step": 434 + }, + { + "epoch": 0.0553364711868719, + "ewc_loss": 6.07222318649292e-07, + "grad_norm": 3.3805038928985596, + "learning_rate": 1.8397626112759643e-07, + "loss": 0.5639, + "mean_token_accuracy": 0.823733389377594, + "num_tokens": 16572760.0, + "step": 435 + }, + { + "epoch": 0.05546368146546241, + "ewc_loss": 6.07222318649292e-07, + "grad_norm": 2.8338370323181152, + "learning_rate": 1.844001695633743e-07, + "loss": 0.5235, + "mean_token_accuracy": 0.8348414897918701, + "num_tokens": 16608549.0, + "step": 436 + }, + { + "epoch": 0.05559089174405292, + "ewc_loss": 6.109476089477539e-07, + "grad_norm": 3.620323419570923, + "learning_rate": 1.8482407799915218e-07, + "loss": 0.5607, + "mean_token_accuracy": 0.8252310752868652, + "num_tokens": 16643800.0, + "step": 437 + }, + { + "epoch": 0.05571810202264343, + "ewc_loss": 6.146728992462158e-07, + "grad_norm": 3.437763214111328, + "learning_rate": 1.8524798643493005e-07, + "loss": 0.6337, + "mean_token_accuracy": 0.8004281520843506, + "num_tokens": 16677948.0, + "step": 438 + }, + { + "epoch": 0.05584531230123394, + "ewc_loss": 6.183981895446777e-07, + "grad_norm": 3.1172735691070557, + "learning_rate": 1.8567189487070792e-07, + "loss": 0.6079, + "mean_token_accuracy": 0.8120819330215454, + "num_tokens": 16716905.0, + "step": 439 + }, + { + "epoch": 0.05597252257982445, + "ewc_loss": 6.221234798431396e-07, + "grad_norm": 3.2881386280059814, + "learning_rate": 1.8609580330648577e-07, + "loss": 0.5197, + "mean_token_accuracy": 0.8355753421783447, + "num_tokens": 16756158.0, + "step": 440 + }, + { + "epoch": 0.05609973285841496, + "ewc_loss": 6.221234798431396e-07, + "grad_norm": 3.8303866386413574, + "learning_rate": 1.8651971174226367e-07, + "loss": 0.6104, + "mean_token_accuracy": 0.8137568235397339, + "num_tokens": 16794261.0, + "step": 441 + }, + { + "epoch": 0.05622694313700547, + "ewc_loss": 6.221234798431396e-07, + "grad_norm": 3.0885989665985107, + "learning_rate": 1.8694362017804152e-07, + "loss": 0.5675, + "mean_token_accuracy": 0.8216005563735962, + "num_tokens": 16834953.0, + "step": 442 + }, + { + "epoch": 0.05635415341559598, + "ewc_loss": 6.221234798431396e-07, + "grad_norm": 3.5379791259765625, + "learning_rate": 1.8736752861381941e-07, + "loss": 0.526, + "mean_token_accuracy": 0.8357124328613281, + "num_tokens": 16873954.0, + "step": 443 + }, + { + "epoch": 0.05648136369418649, + "ewc_loss": 6.258487701416016e-07, + "grad_norm": 3.347118854522705, + "learning_rate": 1.8779143704959726e-07, + "loss": 0.548, + "mean_token_accuracy": 0.8259832859039307, + "num_tokens": 16906018.0, + "step": 444 + }, + { + "epoch": 0.056608573972777, + "ewc_loss": 6.295740604400635e-07, + "grad_norm": 3.6007137298583984, + "learning_rate": 1.8821534548537516e-07, + "loss": 0.601, + "mean_token_accuracy": 0.8141239881515503, + "num_tokens": 16938945.0, + "step": 445 + }, + { + "epoch": 0.05673578425136751, + "ewc_loss": 6.295740604400635e-07, + "grad_norm": 3.027589797973633, + "learning_rate": 1.88639253921153e-07, + "loss": 0.5597, + "mean_token_accuracy": 0.8273853063583374, + "num_tokens": 16982214.0, + "step": 446 + }, + { + "epoch": 0.05686299452995802, + "ewc_loss": 6.332993507385254e-07, + "grad_norm": 3.2546944618225098, + "learning_rate": 1.890631623569309e-07, + "loss": 0.569, + "mean_token_accuracy": 0.821973443031311, + "num_tokens": 17024367.0, + "step": 447 + }, + { + "epoch": 0.05699020480854853, + "ewc_loss": 6.332993507385254e-07, + "grad_norm": 3.610473394393921, + "learning_rate": 1.8948707079270875e-07, + "loss": 0.5902, + "mean_token_accuracy": 0.8115723133087158, + "num_tokens": 17057451.0, + "step": 448 + }, + { + "epoch": 0.057117415087139044, + "ewc_loss": 6.332993507385254e-07, + "grad_norm": 4.371118545532227, + "learning_rate": 1.8991097922848665e-07, + "loss": 0.5418, + "mean_token_accuracy": 0.8295284509658813, + "num_tokens": 17094633.0, + "step": 449 + }, + { + "epoch": 0.05724462536572955, + "ewc_loss": 6.332993507385254e-07, + "grad_norm": 3.4244072437286377, + "learning_rate": 1.903348876642645e-07, + "loss": 0.596, + "mean_token_accuracy": 0.8114916682243347, + "num_tokens": 17135892.0, + "step": 450 + }, + { + "epoch": 0.05737183564432006, + "ewc_loss": 6.332993507385254e-07, + "grad_norm": 3.532428026199341, + "learning_rate": 1.907587961000424e-07, + "loss": 0.5392, + "mean_token_accuracy": 0.831398606300354, + "num_tokens": 17173881.0, + "step": 451 + }, + { + "epoch": 0.05749904592291057, + "ewc_loss": 6.370246410369873e-07, + "grad_norm": 3.3205442428588867, + "learning_rate": 1.9118270453582024e-07, + "loss": 0.5879, + "mean_token_accuracy": 0.8173248171806335, + "num_tokens": 17215668.0, + "step": 452 + }, + { + "epoch": 0.05762625620150108, + "ewc_loss": 6.370246410369873e-07, + "grad_norm": 3.3115315437316895, + "learning_rate": 1.9160661297159814e-07, + "loss": 0.6109, + "mean_token_accuracy": 0.8118069171905518, + "num_tokens": 17254339.0, + "step": 453 + }, + { + "epoch": 0.057753466480091595, + "ewc_loss": 6.407499313354492e-07, + "grad_norm": 3.58941650390625, + "learning_rate": 1.9203052140737599e-07, + "loss": 0.5971, + "mean_token_accuracy": 0.8115682005882263, + "num_tokens": 17288858.0, + "step": 454 + }, + { + "epoch": 0.0578806767586821, + "ewc_loss": 6.407499313354492e-07, + "grad_norm": 3.310730457305908, + "learning_rate": 1.9245442984315389e-07, + "loss": 0.5735, + "mean_token_accuracy": 0.8239067196846008, + "num_tokens": 17330601.0, + "step": 455 + }, + { + "epoch": 0.05800788703727261, + "ewc_loss": 6.444752216339111e-07, + "grad_norm": 3.243911027908325, + "learning_rate": 1.9287833827893173e-07, + "loss": 0.5931, + "mean_token_accuracy": 0.8144667148590088, + "num_tokens": 17367797.0, + "step": 456 + }, + { + "epoch": 0.058135097315863124, + "ewc_loss": 6.444752216339111e-07, + "grad_norm": 3.29453706741333, + "learning_rate": 1.9330224671470963e-07, + "loss": 0.5782, + "mean_token_accuracy": 0.8197800517082214, + "num_tokens": 17403206.0, + "step": 457 + }, + { + "epoch": 0.05826230759445363, + "ewc_loss": 6.48200511932373e-07, + "grad_norm": 3.4520821571350098, + "learning_rate": 1.9372615515048748e-07, + "loss": 0.6242, + "mean_token_accuracy": 0.8075498342514038, + "num_tokens": 17437207.0, + "step": 458 + }, + { + "epoch": 0.05838951787304414, + "ewc_loss": 6.48200511932373e-07, + "grad_norm": 3.341430187225342, + "learning_rate": 1.9415006358626535e-07, + "loss": 0.5606, + "mean_token_accuracy": 0.8267058730125427, + "num_tokens": 17473335.0, + "step": 459 + }, + { + "epoch": 0.05851672815163465, + "ewc_loss": 6.51925802230835e-07, + "grad_norm": 3.97370982170105, + "learning_rate": 1.9457397202204322e-07, + "loss": 0.5511, + "mean_token_accuracy": 0.8239080309867859, + "num_tokens": 17505104.0, + "step": 460 + }, + { + "epoch": 0.05864393843022516, + "ewc_loss": 6.556510925292969e-07, + "grad_norm": 3.597811460494995, + "learning_rate": 1.949978804578211e-07, + "loss": 0.5341, + "mean_token_accuracy": 0.8337923884391785, + "num_tokens": 17539052.0, + "step": 461 + }, + { + "epoch": 0.058771148708815675, + "ewc_loss": 6.556510925292969e-07, + "grad_norm": 2.8391215801239014, + "learning_rate": 1.9542178889359897e-07, + "loss": 0.608, + "mean_token_accuracy": 0.8092217445373535, + "num_tokens": 17578942.0, + "step": 462 + }, + { + "epoch": 0.05889835898740618, + "ewc_loss": 6.593763828277588e-07, + "grad_norm": 3.29502534866333, + "learning_rate": 1.9584569732937684e-07, + "loss": 0.5854, + "mean_token_accuracy": 0.8211100101470947, + "num_tokens": 17611931.0, + "step": 463 + }, + { + "epoch": 0.05902556926599669, + "ewc_loss": 6.593763828277588e-07, + "grad_norm": 3.3372766971588135, + "learning_rate": 1.962696057651547e-07, + "loss": 0.4921, + "mean_token_accuracy": 0.8449795246124268, + "num_tokens": 17648764.0, + "step": 464 + }, + { + "epoch": 0.059152779544587204, + "ewc_loss": 6.705522537231445e-07, + "grad_norm": 3.4397010803222656, + "learning_rate": 1.9669351420093258e-07, + "loss": 0.602, + "mean_token_accuracy": 0.8121716380119324, + "num_tokens": 17683099.0, + "step": 465 + }, + { + "epoch": 0.05927998982317771, + "ewc_loss": 6.742775440216064e-07, + "grad_norm": 3.119743585586548, + "learning_rate": 1.9711742263671046e-07, + "loss": 0.5881, + "mean_token_accuracy": 0.8207216858863831, + "num_tokens": 17727120.0, + "step": 466 + }, + { + "epoch": 0.059407200101768226, + "ewc_loss": 6.742775440216064e-07, + "grad_norm": 3.7774643898010254, + "learning_rate": 1.9754133107248833e-07, + "loss": 0.5409, + "mean_token_accuracy": 0.8245866298675537, + "num_tokens": 17761676.0, + "step": 467 + }, + { + "epoch": 0.059534410380358734, + "ewc_loss": 6.742775440216064e-07, + "grad_norm": 2.7458274364471436, + "learning_rate": 1.979652395082662e-07, + "loss": 0.6098, + "mean_token_accuracy": 0.8118389844894409, + "num_tokens": 17802311.0, + "step": 468 + }, + { + "epoch": 0.05966162065894924, + "ewc_loss": 6.780028343200684e-07, + "grad_norm": 2.532662868499756, + "learning_rate": 1.9838914794404408e-07, + "loss": 0.5012, + "mean_token_accuracy": 0.8392121195793152, + "num_tokens": 17839048.0, + "step": 469 + }, + { + "epoch": 0.059788830937539755, + "ewc_loss": 6.742775440216064e-07, + "grad_norm": 2.741809844970703, + "learning_rate": 1.9881305637982195e-07, + "loss": 0.5722, + "mean_token_accuracy": 0.8211686611175537, + "num_tokens": 17881510.0, + "step": 470 + }, + { + "epoch": 0.05991604121613026, + "ewc_loss": 6.780028343200684e-07, + "grad_norm": 3.882176637649536, + "learning_rate": 1.9923696481559982e-07, + "loss": 0.5965, + "mean_token_accuracy": 0.8160213232040405, + "num_tokens": 17915080.0, + "step": 471 + }, + { + "epoch": 0.06004325149472077, + "ewc_loss": 6.780028343200684e-07, + "grad_norm": 3.2533085346221924, + "learning_rate": 1.996608732513777e-07, + "loss": 0.6298, + "mean_token_accuracy": 0.8055557608604431, + "num_tokens": 17957972.0, + "step": 472 + }, + { + "epoch": 0.060170461773311285, + "ewc_loss": 7.003545761108398e-07, + "grad_norm": 2.7785181999206543, + "learning_rate": 2.0008478168715557e-07, + "loss": 0.5019, + "mean_token_accuracy": 0.8388392925262451, + "num_tokens": 17998008.0, + "step": 473 + }, + { + "epoch": 0.06029767205190179, + "ewc_loss": 7.078051567077637e-07, + "grad_norm": 2.912216901779175, + "learning_rate": 2.0050869012293344e-07, + "loss": 0.5079, + "mean_token_accuracy": 0.8359943628311157, + "num_tokens": 18032427.0, + "step": 474 + }, + { + "epoch": 0.060424882330492306, + "ewc_loss": 7.115304470062256e-07, + "grad_norm": 2.6883749961853027, + "learning_rate": 2.009325985587113e-07, + "loss": 0.5387, + "mean_token_accuracy": 0.8299548029899597, + "num_tokens": 18069848.0, + "step": 475 + }, + { + "epoch": 0.060552092609082814, + "ewc_loss": 7.115304470062256e-07, + "grad_norm": 2.8319172859191895, + "learning_rate": 2.0135650699448918e-07, + "loss": 0.6308, + "mean_token_accuracy": 0.8032740354537964, + "num_tokens": 18109240.0, + "step": 476 + }, + { + "epoch": 0.06067930288767332, + "ewc_loss": 7.115304470062256e-07, + "grad_norm": 3.0766468048095703, + "learning_rate": 2.0178041543026706e-07, + "loss": 0.601, + "mean_token_accuracy": 0.814417839050293, + "num_tokens": 18146849.0, + "step": 477 + }, + { + "epoch": 0.060806513166263836, + "ewc_loss": 7.152557373046875e-07, + "grad_norm": 3.731172800064087, + "learning_rate": 2.022043238660449e-07, + "loss": 0.6313, + "mean_token_accuracy": 0.8032058477401733, + "num_tokens": 18186872.0, + "step": 478 + }, + { + "epoch": 0.06093372344485434, + "ewc_loss": 7.152557373046875e-07, + "grad_norm": 2.8847410678863525, + "learning_rate": 2.026282323018228e-07, + "loss": 0.6067, + "mean_token_accuracy": 0.8129768371582031, + "num_tokens": 18222299.0, + "step": 479 + }, + { + "epoch": 0.06106093372344486, + "ewc_loss": 7.227063179016113e-07, + "grad_norm": 2.5058116912841797, + "learning_rate": 2.0305214073760065e-07, + "loss": 0.6107, + "mean_token_accuracy": 0.8133139610290527, + "num_tokens": 18261645.0, + "step": 480 + }, + { + "epoch": 0.061188144002035365, + "ewc_loss": 7.189810276031494e-07, + "grad_norm": 2.7434613704681396, + "learning_rate": 2.0347604917337855e-07, + "loss": 0.5812, + "mean_token_accuracy": 0.8186022043228149, + "num_tokens": 18292970.0, + "step": 481 + }, + { + "epoch": 0.06131535428062587, + "ewc_loss": 7.227063179016113e-07, + "grad_norm": 2.7249999046325684, + "learning_rate": 2.038999576091564e-07, + "loss": 0.5293, + "mean_token_accuracy": 0.835991382598877, + "num_tokens": 18331133.0, + "step": 482 + }, + { + "epoch": 0.06144256455921639, + "ewc_loss": 7.264316082000732e-07, + "grad_norm": 3.045933246612549, + "learning_rate": 2.043238660449343e-07, + "loss": 0.6154, + "mean_token_accuracy": 0.8087680339813232, + "num_tokens": 18368905.0, + "step": 483 + }, + { + "epoch": 0.061569774837806894, + "ewc_loss": 7.301568984985352e-07, + "grad_norm": 2.8851635456085205, + "learning_rate": 2.0474777448071214e-07, + "loss": 0.5402, + "mean_token_accuracy": 0.8323422074317932, + "num_tokens": 18406362.0, + "step": 484 + }, + { + "epoch": 0.0616969851163974, + "ewc_loss": 7.413327693939209e-07, + "grad_norm": 3.4241368770599365, + "learning_rate": 2.0517168291649004e-07, + "loss": 0.5758, + "mean_token_accuracy": 0.8210073709487915, + "num_tokens": 18439246.0, + "step": 485 + }, + { + "epoch": 0.061824195394987916, + "ewc_loss": 7.37607479095459e-07, + "grad_norm": 2.9749557971954346, + "learning_rate": 2.0559559135226788e-07, + "loss": 0.5804, + "mean_token_accuracy": 0.8166208267211914, + "num_tokens": 18473408.0, + "step": 486 + }, + { + "epoch": 0.06195140567357842, + "ewc_loss": 7.37607479095459e-07, + "grad_norm": 2.9190192222595215, + "learning_rate": 2.0601949978804578e-07, + "loss": 0.5348, + "mean_token_accuracy": 0.8307129144668579, + "num_tokens": 18513246.0, + "step": 487 + }, + { + "epoch": 0.06207861595216894, + "ewc_loss": 7.413327693939209e-07, + "grad_norm": 2.6816115379333496, + "learning_rate": 2.0644340822382363e-07, + "loss": 0.5065, + "mean_token_accuracy": 0.8393481969833374, + "num_tokens": 18549125.0, + "step": 488 + }, + { + "epoch": 0.062205826230759445, + "ewc_loss": 7.413327693939209e-07, + "grad_norm": 2.773392677307129, + "learning_rate": 2.0686731665960153e-07, + "loss": 0.5569, + "mean_token_accuracy": 0.8309541940689087, + "num_tokens": 18585306.0, + "step": 489 + }, + { + "epoch": 0.06233303650934995, + "ewc_loss": 7.413327693939209e-07, + "grad_norm": 3.3996260166168213, + "learning_rate": 2.0729122509537937e-07, + "loss": 0.516, + "mean_token_accuracy": 0.8384185433387756, + "num_tokens": 18624378.0, + "step": 490 + }, + { + "epoch": 0.06246024678794047, + "ewc_loss": 7.413327693939209e-07, + "grad_norm": 3.4545066356658936, + "learning_rate": 2.0771513353115727e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.8242170810699463, + "num_tokens": 18660814.0, + "step": 491 + }, + { + "epoch": 0.06258745706653097, + "ewc_loss": 7.525086402893066e-07, + "grad_norm": 7.887014865875244, + "learning_rate": 2.0813904196693512e-07, + "loss": 0.529, + "mean_token_accuracy": 0.8342187404632568, + "num_tokens": 18702129.0, + "step": 492 + }, + { + "epoch": 0.06271466734512149, + "ewc_loss": 7.487833499908447e-07, + "grad_norm": 3.2052831649780273, + "learning_rate": 2.0856295040271302e-07, + "loss": 0.5983, + "mean_token_accuracy": 0.8125495910644531, + "num_tokens": 18737496.0, + "step": 493 + }, + { + "epoch": 0.06284187762371199, + "ewc_loss": 7.525086402893066e-07, + "grad_norm": 2.4734225273132324, + "learning_rate": 2.0898685883849086e-07, + "loss": 0.5571, + "mean_token_accuracy": 0.8288863897323608, + "num_tokens": 18779332.0, + "step": 494 + }, + { + "epoch": 0.0629690879023025, + "ewc_loss": 7.525086402893066e-07, + "grad_norm": 2.4515891075134277, + "learning_rate": 2.0941076727426874e-07, + "loss": 0.543, + "mean_token_accuracy": 0.8300517797470093, + "num_tokens": 18817976.0, + "step": 495 + }, + { + "epoch": 0.06309629818089302, + "ewc_loss": 7.525086402893066e-07, + "grad_norm": 2.0696542263031006, + "learning_rate": 2.098346757100466e-07, + "loss": 0.5335, + "mean_token_accuracy": 0.8315675258636475, + "num_tokens": 18862860.0, + "step": 496 + }, + { + "epoch": 0.06322350845948353, + "ewc_loss": 7.487833499908447e-07, + "grad_norm": 2.6264572143554688, + "learning_rate": 2.1025858414582448e-07, + "loss": 0.5737, + "mean_token_accuracy": 0.8194637298583984, + "num_tokens": 18899025.0, + "step": 497 + }, + { + "epoch": 0.06335071873807403, + "ewc_loss": 7.487833499908447e-07, + "grad_norm": 2.623300075531006, + "learning_rate": 2.1068249258160238e-07, + "loss": 0.5391, + "mean_token_accuracy": 0.8243741989135742, + "num_tokens": 18933118.0, + "step": 498 + }, + { + "epoch": 0.06347792901666455, + "ewc_loss": 7.562339305877686e-07, + "grad_norm": 2.718092679977417, + "learning_rate": 2.1110640101738023e-07, + "loss": 0.5406, + "mean_token_accuracy": 0.8319485187530518, + "num_tokens": 18969165.0, + "step": 499 + }, + { + "epoch": 0.06360513929525506, + "ewc_loss": 7.674098014831543e-07, + "grad_norm": 2.908787727355957, + "learning_rate": 2.1153030945315813e-07, + "loss": 0.5489, + "mean_token_accuracy": 0.8267939686775208, + "num_tokens": 19003882.0, + "step": 500 + }, + { + "epoch": 0.06373234957384556, + "ewc_loss": 7.711350917816162e-07, + "grad_norm": 2.6804895401000977, + "learning_rate": 2.1195421788893597e-07, + "loss": 0.5496, + "mean_token_accuracy": 0.8287574052810669, + "num_tokens": 19037540.0, + "step": 501 + }, + { + "epoch": 0.06385955985243608, + "ewc_loss": 7.748603820800781e-07, + "grad_norm": 2.712878465652466, + "learning_rate": 2.1237812632471387e-07, + "loss": 0.561, + "mean_token_accuracy": 0.8198551535606384, + "num_tokens": 19077270.0, + "step": 502 + }, + { + "epoch": 0.06398677013102659, + "ewc_loss": 7.748603820800781e-07, + "grad_norm": 2.7758774757385254, + "learning_rate": 2.1280203476049172e-07, + "loss": 0.6476, + "mean_token_accuracy": 0.7946560382843018, + "num_tokens": 19117579.0, + "step": 503 + }, + { + "epoch": 0.06411398040961709, + "ewc_loss": 7.7858567237854e-07, + "grad_norm": 2.6070237159729004, + "learning_rate": 2.1322594319626962e-07, + "loss": 0.5876, + "mean_token_accuracy": 0.8154790997505188, + "num_tokens": 19156599.0, + "step": 504 + }, + { + "epoch": 0.0642411906882076, + "ewc_loss": 7.82310962677002e-07, + "grad_norm": 2.822726249694824, + "learning_rate": 2.1364985163204746e-07, + "loss": 0.5775, + "mean_token_accuracy": 0.8188380002975464, + "num_tokens": 19187367.0, + "step": 505 + }, + { + "epoch": 0.06436840096679812, + "ewc_loss": 7.897615432739258e-07, + "grad_norm": 2.698958396911621, + "learning_rate": 2.1407376006782536e-07, + "loss": 0.5137, + "mean_token_accuracy": 0.8365070819854736, + "num_tokens": 19223916.0, + "step": 506 + }, + { + "epoch": 0.06449561124538863, + "ewc_loss": 7.934868335723877e-07, + "grad_norm": 2.6596996784210205, + "learning_rate": 2.144976685036032e-07, + "loss": 0.5283, + "mean_token_accuracy": 0.8346209526062012, + "num_tokens": 19260336.0, + "step": 507 + }, + { + "epoch": 0.06462282152397913, + "ewc_loss": 7.934868335723877e-07, + "grad_norm": 2.3162953853607178, + "learning_rate": 2.149215769393811e-07, + "loss": 0.5894, + "mean_token_accuracy": 0.8173232078552246, + "num_tokens": 19299895.0, + "step": 508 + }, + { + "epoch": 0.06475003180256965, + "ewc_loss": 7.972121238708496e-07, + "grad_norm": 2.599586009979248, + "learning_rate": 2.1534548537515895e-07, + "loss": 0.5324, + "mean_token_accuracy": 0.8318362236022949, + "num_tokens": 19336614.0, + "step": 509 + }, + { + "epoch": 0.06487724208116016, + "ewc_loss": 8.009374141693115e-07, + "grad_norm": 2.2897439002990723, + "learning_rate": 2.1576939381093685e-07, + "loss": 0.5829, + "mean_token_accuracy": 0.8191546201705933, + "num_tokens": 19378683.0, + "step": 510 + }, + { + "epoch": 0.06500445235975066, + "ewc_loss": 8.083879947662354e-07, + "grad_norm": 2.2992794513702393, + "learning_rate": 2.161933022467147e-07, + "loss": 0.5666, + "mean_token_accuracy": 0.819961667060852, + "num_tokens": 19422598.0, + "step": 511 + }, + { + "epoch": 0.06513166263834118, + "ewc_loss": 8.121132850646973e-07, + "grad_norm": 2.286520481109619, + "learning_rate": 2.166172106824926e-07, + "loss": 0.5576, + "mean_token_accuracy": 0.8285623788833618, + "num_tokens": 19466972.0, + "step": 512 + }, + { + "epoch": 0.06525887291693169, + "ewc_loss": 8.158385753631592e-07, + "grad_norm": 2.914865255355835, + "learning_rate": 2.1704111911827044e-07, + "loss": 0.5472, + "mean_token_accuracy": 0.8235335946083069, + "num_tokens": 19503809.0, + "step": 513 + }, + { + "epoch": 0.0653860831955222, + "ewc_loss": 8.23289155960083e-07, + "grad_norm": 2.3473598957061768, + "learning_rate": 2.1746502755404831e-07, + "loss": 0.6029, + "mean_token_accuracy": 0.8180657625198364, + "num_tokens": 19545713.0, + "step": 514 + }, + { + "epoch": 0.06551329347411271, + "ewc_loss": 8.270144462585449e-07, + "grad_norm": 2.5760045051574707, + "learning_rate": 2.178889359898262e-07, + "loss": 0.5894, + "mean_token_accuracy": 0.8196823000907898, + "num_tokens": 19582272.0, + "step": 515 + }, + { + "epoch": 0.06564050375270322, + "ewc_loss": 8.23289155960083e-07, + "grad_norm": 2.251791477203369, + "learning_rate": 2.1831284442560406e-07, + "loss": 0.6043, + "mean_token_accuracy": 0.8132323026657104, + "num_tokens": 19625574.0, + "step": 516 + }, + { + "epoch": 0.06576771403129372, + "ewc_loss": 8.23289155960083e-07, + "grad_norm": 2.392033338546753, + "learning_rate": 2.1873675286138193e-07, + "loss": 0.592, + "mean_token_accuracy": 0.8132587671279907, + "num_tokens": 19667791.0, + "step": 517 + }, + { + "epoch": 0.06589492430988424, + "ewc_loss": 8.270144462585449e-07, + "grad_norm": 2.699307918548584, + "learning_rate": 2.191606612971598e-07, + "loss": 0.6198, + "mean_token_accuracy": 0.8071184158325195, + "num_tokens": 19708289.0, + "step": 518 + }, + { + "epoch": 0.06602213458847475, + "ewc_loss": 8.307397365570068e-07, + "grad_norm": 2.341578960418701, + "learning_rate": 2.1958456973293768e-07, + "loss": 0.5993, + "mean_token_accuracy": 0.8081200122833252, + "num_tokens": 19747444.0, + "step": 519 + }, + { + "epoch": 0.06614934486706527, + "ewc_loss": 8.381903171539307e-07, + "grad_norm": 2.514535665512085, + "learning_rate": 2.2000847816871555e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.8408082127571106, + "num_tokens": 19787061.0, + "step": 520 + }, + { + "epoch": 0.06627655514565577, + "ewc_loss": 8.419156074523926e-07, + "grad_norm": 2.1212856769561768, + "learning_rate": 2.2043238660449342e-07, + "loss": 0.5299, + "mean_token_accuracy": 0.834848165512085, + "num_tokens": 19826893.0, + "step": 521 + }, + { + "epoch": 0.06640376542424628, + "ewc_loss": 8.419156074523926e-07, + "grad_norm": 2.239295482635498, + "learning_rate": 2.208562950402713e-07, + "loss": 0.523, + "mean_token_accuracy": 0.8318498134613037, + "num_tokens": 19864285.0, + "step": 522 + }, + { + "epoch": 0.0665309757028368, + "ewc_loss": 8.419156074523926e-07, + "grad_norm": 2.2001395225524902, + "learning_rate": 2.2128020347604917e-07, + "loss": 0.5587, + "mean_token_accuracy": 0.8254128694534302, + "num_tokens": 19903806.0, + "step": 523 + }, + { + "epoch": 0.0666581859814273, + "ewc_loss": 8.456408977508545e-07, + "grad_norm": 2.291118621826172, + "learning_rate": 2.2170411191182704e-07, + "loss": 0.5361, + "mean_token_accuracy": 0.8301997780799866, + "num_tokens": 19950382.0, + "step": 524 + }, + { + "epoch": 0.06678539626001781, + "ewc_loss": 8.493661880493164e-07, + "grad_norm": 2.4222006797790527, + "learning_rate": 2.221280203476049e-07, + "loss": 0.5599, + "mean_token_accuracy": 0.8224882483482361, + "num_tokens": 19984765.0, + "step": 525 + }, + { + "epoch": 0.06691260653860832, + "ewc_loss": 8.493661880493164e-07, + "grad_norm": 2.4865670204162598, + "learning_rate": 2.2255192878338279e-07, + "loss": 0.5993, + "mean_token_accuracy": 0.81597900390625, + "num_tokens": 20021081.0, + "step": 526 + }, + { + "epoch": 0.06703981681719882, + "ewc_loss": 8.530914783477783e-07, + "grad_norm": 2.8104798793792725, + "learning_rate": 2.2297583721916066e-07, + "loss": 0.57, + "mean_token_accuracy": 0.8202943801879883, + "num_tokens": 20055860.0, + "step": 527 + }, + { + "epoch": 0.06716702709578934, + "ewc_loss": 8.530914783477783e-07, + "grad_norm": 2.4323647022247314, + "learning_rate": 2.2339974565493853e-07, + "loss": 0.5272, + "mean_token_accuracy": 0.835902214050293, + "num_tokens": 20095023.0, + "step": 528 + }, + { + "epoch": 0.06729423737437985, + "ewc_loss": 8.866190910339355e-07, + "grad_norm": 16.68150520324707, + "learning_rate": 2.238236540907164e-07, + "loss": 0.5811, + "mean_token_accuracy": 0.8187259435653687, + "num_tokens": 20134620.0, + "step": 529 + }, + { + "epoch": 0.06742144765297035, + "ewc_loss": 8.67992639541626e-07, + "grad_norm": 2.655592918395996, + "learning_rate": 2.2424756252649428e-07, + "loss": 0.5505, + "mean_token_accuracy": 0.8245222568511963, + "num_tokens": 20169690.0, + "step": 530 + }, + { + "epoch": 0.06754865793156087, + "ewc_loss": 8.67992639541626e-07, + "grad_norm": 2.9105684757232666, + "learning_rate": 2.2467147096227215e-07, + "loss": 0.5429, + "mean_token_accuracy": 0.8299115896224976, + "num_tokens": 20203609.0, + "step": 531 + }, + { + "epoch": 0.06767586821015138, + "ewc_loss": 8.67992639541626e-07, + "grad_norm": 2.812011241912842, + "learning_rate": 2.2509537939805002e-07, + "loss": 0.5488, + "mean_token_accuracy": 0.824640154838562, + "num_tokens": 20234616.0, + "step": 532 + }, + { + "epoch": 0.0678030784887419, + "ewc_loss": 8.717179298400879e-07, + "grad_norm": 2.345608949661255, + "learning_rate": 2.2551928783382787e-07, + "loss": 0.5409, + "mean_token_accuracy": 0.826587438583374, + "num_tokens": 20271634.0, + "step": 533 + }, + { + "epoch": 0.0679302887673324, + "ewc_loss": 8.717179298400879e-07, + "grad_norm": 2.538966178894043, + "learning_rate": 2.2594319626960577e-07, + "loss": 0.5687, + "mean_token_accuracy": 0.824821949005127, + "num_tokens": 20309683.0, + "step": 534 + }, + { + "epoch": 0.06805749904592291, + "ewc_loss": 8.717179298400879e-07, + "grad_norm": 2.3981781005859375, + "learning_rate": 2.263671047053836e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8417949676513672, + "num_tokens": 20346013.0, + "step": 535 + }, + { + "epoch": 0.06818470932451343, + "ewc_loss": 8.717179298400879e-07, + "grad_norm": 3.057840585708618, + "learning_rate": 2.267910131411615e-07, + "loss": 0.5397, + "mean_token_accuracy": 0.8326334953308105, + "num_tokens": 20387613.0, + "step": 536 + }, + { + "epoch": 0.06831191960310393, + "ewc_loss": 8.717179298400879e-07, + "grad_norm": 2.6792373657226562, + "learning_rate": 2.2721492157693936e-07, + "loss": 0.5394, + "mean_token_accuracy": 0.8286234736442566, + "num_tokens": 20417579.0, + "step": 537 + }, + { + "epoch": 0.06843912988169444, + "ewc_loss": 8.717179298400879e-07, + "grad_norm": 2.168741464614868, + "learning_rate": 2.2763883001271726e-07, + "loss": 0.5118, + "mean_token_accuracy": 0.8356480002403259, + "num_tokens": 20455100.0, + "step": 538 + }, + { + "epoch": 0.06856634016028496, + "ewc_loss": 8.717179298400879e-07, + "grad_norm": 2.2447986602783203, + "learning_rate": 2.280627384484951e-07, + "loss": 0.5727, + "mean_token_accuracy": 0.8185651302337646, + "num_tokens": 20496458.0, + "step": 539 + }, + { + "epoch": 0.06869355043887546, + "ewc_loss": 8.754432201385498e-07, + "grad_norm": 2.6183249950408936, + "learning_rate": 2.28486646884273e-07, + "loss": 0.548, + "mean_token_accuracy": 0.8256886005401611, + "num_tokens": 20533758.0, + "step": 540 + }, + { + "epoch": 0.06882076071746597, + "ewc_loss": 8.791685104370117e-07, + "grad_norm": 2.5690457820892334, + "learning_rate": 2.2891055532005085e-07, + "loss": 0.5929, + "mean_token_accuracy": 0.8163676857948303, + "num_tokens": 20567259.0, + "step": 541 + }, + { + "epoch": 0.06894797099605648, + "ewc_loss": 8.828938007354736e-07, + "grad_norm": 2.7926859855651855, + "learning_rate": 2.2933446375582875e-07, + "loss": 0.5741, + "mean_token_accuracy": 0.8217257857322693, + "num_tokens": 20599333.0, + "step": 542 + }, + { + "epoch": 0.06907518127464699, + "ewc_loss": 8.828938007354736e-07, + "grad_norm": 2.382006883621216, + "learning_rate": 2.297583721916066e-07, + "loss": 0.5125, + "mean_token_accuracy": 0.8381298184394836, + "num_tokens": 20640195.0, + "step": 543 + }, + { + "epoch": 0.0692023915532375, + "ewc_loss": 8.828938007354736e-07, + "grad_norm": 2.263057231903076, + "learning_rate": 2.301822806273845e-07, + "loss": 0.5323, + "mean_token_accuracy": 0.8317908644676208, + "num_tokens": 20682617.0, + "step": 544 + }, + { + "epoch": 0.06932960183182801, + "ewc_loss": 8.828938007354736e-07, + "grad_norm": 2.5558414459228516, + "learning_rate": 2.3060618906316234e-07, + "loss": 0.5827, + "mean_token_accuracy": 0.8207383155822754, + "num_tokens": 20718032.0, + "step": 545 + }, + { + "epoch": 0.06945681211041853, + "ewc_loss": 8.866190910339355e-07, + "grad_norm": 2.5719661712646484, + "learning_rate": 2.3103009749894024e-07, + "loss": 0.549, + "mean_token_accuracy": 0.8268862962722778, + "num_tokens": 20755190.0, + "step": 546 + }, + { + "epoch": 0.06958402238900903, + "ewc_loss": 8.866190910339355e-07, + "grad_norm": 2.1511738300323486, + "learning_rate": 2.3145400593471808e-07, + "loss": 0.5723, + "mean_token_accuracy": 0.8179306387901306, + "num_tokens": 20799088.0, + "step": 547 + }, + { + "epoch": 0.06971123266759954, + "ewc_loss": 8.866190910339355e-07, + "grad_norm": 2.7070722579956055, + "learning_rate": 2.3187791437049598e-07, + "loss": 0.6017, + "mean_token_accuracy": 0.8108012080192566, + "num_tokens": 20831819.0, + "step": 548 + }, + { + "epoch": 0.06983844294619006, + "ewc_loss": 8.866190910339355e-07, + "grad_norm": 2.6030290126800537, + "learning_rate": 2.3230182280627383e-07, + "loss": 0.5307, + "mean_token_accuracy": 0.8328275084495544, + "num_tokens": 20861291.0, + "step": 549 + }, + { + "epoch": 0.06996565322478056, + "ewc_loss": 8.940696716308594e-07, + "grad_norm": 2.2400879859924316, + "learning_rate": 2.327257312420517e-07, + "loss": 0.5084, + "mean_token_accuracy": 0.8409179449081421, + "num_tokens": 20897446.0, + "step": 550 + }, + { + "epoch": 0.07009286350337107, + "ewc_loss": 8.977949619293213e-07, + "grad_norm": 2.7164273262023926, + "learning_rate": 2.3314963967782957e-07, + "loss": 0.5325, + "mean_token_accuracy": 0.8275951147079468, + "num_tokens": 20934423.0, + "step": 551 + }, + { + "epoch": 0.07022007378196159, + "ewc_loss": 9.015202522277832e-07, + "grad_norm": 2.349909543991089, + "learning_rate": 2.3357354811360745e-07, + "loss": 0.5515, + "mean_token_accuracy": 0.8296972513198853, + "num_tokens": 20978394.0, + "step": 552 + }, + { + "epoch": 0.07034728406055209, + "ewc_loss": 9.015202522277832e-07, + "grad_norm": 2.7455806732177734, + "learning_rate": 2.3399745654938532e-07, + "loss": 0.5756, + "mean_token_accuracy": 0.8205547332763672, + "num_tokens": 21010669.0, + "step": 553 + }, + { + "epoch": 0.0704744943391426, + "ewc_loss": 9.015202522277832e-07, + "grad_norm": 2.196019172668457, + "learning_rate": 2.344213649851632e-07, + "loss": 0.5455, + "mean_token_accuracy": 0.8296135663986206, + "num_tokens": 21047696.0, + "step": 554 + }, + { + "epoch": 0.07060170461773312, + "ewc_loss": 9.08970832824707e-07, + "grad_norm": 2.1608505249023438, + "learning_rate": 2.3484527342094106e-07, + "loss": 0.6266, + "mean_token_accuracy": 0.8077785968780518, + "num_tokens": 21091437.0, + "step": 555 + }, + { + "epoch": 0.07072891489632362, + "ewc_loss": 9.08970832824707e-07, + "grad_norm": 2.2104337215423584, + "learning_rate": 2.3526918185671894e-07, + "loss": 0.5876, + "mean_token_accuracy": 0.8170924186706543, + "num_tokens": 21128880.0, + "step": 556 + }, + { + "epoch": 0.07085612517491413, + "ewc_loss": 9.08970832824707e-07, + "grad_norm": 2.24660325050354, + "learning_rate": 2.356930902924968e-07, + "loss": 0.538, + "mean_token_accuracy": 0.83146733045578, + "num_tokens": 21168523.0, + "step": 557 + }, + { + "epoch": 0.07098333545350465, + "ewc_loss": 9.164214134216309e-07, + "grad_norm": 2.389547824859619, + "learning_rate": 2.3611699872827468e-07, + "loss": 0.6235, + "mean_token_accuracy": 0.8041018843650818, + "num_tokens": 21211376.0, + "step": 558 + }, + { + "epoch": 0.07111054573209516, + "ewc_loss": 9.201467037200928e-07, + "grad_norm": 2.141681671142578, + "learning_rate": 2.3654090716405255e-07, + "loss": 0.5579, + "mean_token_accuracy": 0.8238940238952637, + "num_tokens": 21249162.0, + "step": 559 + }, + { + "epoch": 0.07123775601068566, + "ewc_loss": 9.201467037200928e-07, + "grad_norm": 2.5477662086486816, + "learning_rate": 2.3696481559983043e-07, + "loss": 0.5829, + "mean_token_accuracy": 0.8164481520652771, + "num_tokens": 21282351.0, + "step": 560 + }, + { + "epoch": 0.07136496628927617, + "ewc_loss": 9.238719940185547e-07, + "grad_norm": 2.3285796642303467, + "learning_rate": 2.373887240356083e-07, + "loss": 0.5747, + "mean_token_accuracy": 0.819928765296936, + "num_tokens": 21319508.0, + "step": 561 + }, + { + "epoch": 0.07149217656786669, + "ewc_loss": 9.238719940185547e-07, + "grad_norm": 2.143578052520752, + "learning_rate": 2.3781263247138617e-07, + "loss": 0.5394, + "mean_token_accuracy": 0.8312370777130127, + "num_tokens": 21357111.0, + "step": 562 + }, + { + "epoch": 0.07161938684645719, + "ewc_loss": 9.275972843170166e-07, + "grad_norm": 2.1651771068573, + "learning_rate": 2.3823654090716404e-07, + "loss": 0.541, + "mean_token_accuracy": 0.8269907832145691, + "num_tokens": 21393222.0, + "step": 563 + }, + { + "epoch": 0.0717465971250477, + "ewc_loss": 9.275972843170166e-07, + "grad_norm": 2.113020420074463, + "learning_rate": 2.386604493429419e-07, + "loss": 0.4966, + "mean_token_accuracy": 0.8401475548744202, + "num_tokens": 21429595.0, + "step": 564 + }, + { + "epoch": 0.07187380740363822, + "ewc_loss": 9.275972843170166e-07, + "grad_norm": 2.259795904159546, + "learning_rate": 2.390843577787198e-07, + "loss": 0.5908, + "mean_token_accuracy": 0.8108585476875305, + "num_tokens": 21467341.0, + "step": 565 + }, + { + "epoch": 0.07200101768222872, + "ewc_loss": 9.275972843170166e-07, + "grad_norm": 1.9037901163101196, + "learning_rate": 2.3950826621449766e-07, + "loss": 0.4711, + "mean_token_accuracy": 0.8511619567871094, + "num_tokens": 21510927.0, + "step": 566 + }, + { + "epoch": 0.07212822796081923, + "ewc_loss": 9.275972843170166e-07, + "grad_norm": 2.405524730682373, + "learning_rate": 2.3993217465027556e-07, + "loss": 0.5455, + "mean_token_accuracy": 0.8293595910072327, + "num_tokens": 21551604.0, + "step": 567 + }, + { + "epoch": 0.07225543823940975, + "ewc_loss": 9.275972843170166e-07, + "grad_norm": 2.5312631130218506, + "learning_rate": 2.403560830860534e-07, + "loss": 0.5949, + "mean_token_accuracy": 0.8175463080406189, + "num_tokens": 21591871.0, + "step": 568 + }, + { + "epoch": 0.07238264851800025, + "ewc_loss": 9.350478649139404e-07, + "grad_norm": 2.450627326965332, + "learning_rate": 2.4077999152183125e-07, + "loss": 0.5928, + "mean_token_accuracy": 0.8145543336868286, + "num_tokens": 21629810.0, + "step": 569 + }, + { + "epoch": 0.07250985879659076, + "ewc_loss": 9.387731552124023e-07, + "grad_norm": 2.0883660316467285, + "learning_rate": 2.4120389995760915e-07, + "loss": 0.4919, + "mean_token_accuracy": 0.8466152548789978, + "num_tokens": 21668706.0, + "step": 570 + }, + { + "epoch": 0.07263706907518128, + "ewc_loss": 9.387731552124023e-07, + "grad_norm": 2.1684305667877197, + "learning_rate": 2.41627808393387e-07, + "loss": 0.5348, + "mean_token_accuracy": 0.8288992047309875, + "num_tokens": 21712725.0, + "step": 571 + }, + { + "epoch": 0.07276427935377179, + "ewc_loss": 9.387731552124023e-07, + "grad_norm": 2.28550124168396, + "learning_rate": 2.420517168291649e-07, + "loss": 0.5459, + "mean_token_accuracy": 0.8309670090675354, + "num_tokens": 21746086.0, + "step": 572 + }, + { + "epoch": 0.07289148963236229, + "ewc_loss": 9.462237358093262e-07, + "grad_norm": 2.4685561656951904, + "learning_rate": 2.4247562526494274e-07, + "loss": 0.5059, + "mean_token_accuracy": 0.837875247001648, + "num_tokens": 21779573.0, + "step": 573 + }, + { + "epoch": 0.0730186999109528, + "ewc_loss": 9.424984455108643e-07, + "grad_norm": 2.032519578933716, + "learning_rate": 2.4289953370072064e-07, + "loss": 0.5767, + "mean_token_accuracy": 0.8179342746734619, + "num_tokens": 21818766.0, + "step": 574 + }, + { + "epoch": 0.07314591018954332, + "ewc_loss": 9.462237358093262e-07, + "grad_norm": 2.0849292278289795, + "learning_rate": 2.433234421364985e-07, + "loss": 0.5325, + "mean_token_accuracy": 0.8356374502182007, + "num_tokens": 21855539.0, + "step": 575 + }, + { + "epoch": 0.07327312046813382, + "ewc_loss": 9.462237358093262e-07, + "grad_norm": 2.016902208328247, + "learning_rate": 2.437473505722764e-07, + "loss": 0.4998, + "mean_token_accuracy": 0.8397610783576965, + "num_tokens": 21892567.0, + "step": 576 + }, + { + "epoch": 0.07340033074672433, + "ewc_loss": 9.5367431640625e-07, + "grad_norm": 2.3538401126861572, + "learning_rate": 2.4417125900805423e-07, + "loss": 0.5176, + "mean_token_accuracy": 0.8287360668182373, + "num_tokens": 21928607.0, + "step": 577 + }, + { + "epoch": 0.07352754102531485, + "ewc_loss": 9.5367431640625e-07, + "grad_norm": 2.307678699493408, + "learning_rate": 2.4459516744383213e-07, + "loss": 0.5642, + "mean_token_accuracy": 0.8229278326034546, + "num_tokens": 21963781.0, + "step": 578 + }, + { + "epoch": 0.07365475130390535, + "ewc_loss": 9.611248970031738e-07, + "grad_norm": 2.1115458011627197, + "learning_rate": 2.4501907587961e-07, + "loss": 0.5244, + "mean_token_accuracy": 0.835269570350647, + "num_tokens": 21999140.0, + "step": 579 + }, + { + "epoch": 0.07378196158249586, + "ewc_loss": 9.611248970031738e-07, + "grad_norm": 1.9687249660491943, + "learning_rate": 2.454429843153879e-07, + "loss": 0.623, + "mean_token_accuracy": 0.8036316633224487, + "num_tokens": 22038703.0, + "step": 580 + }, + { + "epoch": 0.07390917186108638, + "ewc_loss": 9.685754776000977e-07, + "grad_norm": 2.338937520980835, + "learning_rate": 2.458668927511657e-07, + "loss": 0.5385, + "mean_token_accuracy": 0.8228963017463684, + "num_tokens": 22071052.0, + "step": 581 + }, + { + "epoch": 0.07403638213967688, + "ewc_loss": 9.760260581970215e-07, + "grad_norm": 2.68318510055542, + "learning_rate": 2.462908011869436e-07, + "loss": 0.5034, + "mean_token_accuracy": 0.8400887846946716, + "num_tokens": 22109629.0, + "step": 582 + }, + { + "epoch": 0.0741635924182674, + "ewc_loss": 9.760260581970215e-07, + "grad_norm": 2.025428056716919, + "learning_rate": 2.4671470962272147e-07, + "loss": 0.5151, + "mean_token_accuracy": 0.8373210430145264, + "num_tokens": 22151544.0, + "step": 583 + }, + { + "epoch": 0.07429080269685791, + "ewc_loss": 9.834766387939453e-07, + "grad_norm": 2.1194586753845215, + "learning_rate": 2.4713861805849937e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.8411158919334412, + "num_tokens": 22189241.0, + "step": 584 + }, + { + "epoch": 0.07441801297544842, + "ewc_loss": 9.834766387939453e-07, + "grad_norm": 1.916947364807129, + "learning_rate": 2.475625264942772e-07, + "loss": 0.5084, + "mean_token_accuracy": 0.8378486633300781, + "num_tokens": 22229138.0, + "step": 585 + }, + { + "epoch": 0.07454522325403892, + "ewc_loss": 9.834766387939453e-07, + "grad_norm": 2.1600186824798584, + "learning_rate": 2.479864349300551e-07, + "loss": 0.5462, + "mean_token_accuracy": 0.8283840417861938, + "num_tokens": 22264930.0, + "step": 586 + }, + { + "epoch": 0.07467243353262944, + "ewc_loss": 9.834766387939453e-07, + "grad_norm": 2.3061044216156006, + "learning_rate": 2.4841034336583296e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.8389479517936707, + "num_tokens": 22297162.0, + "step": 587 + }, + { + "epoch": 0.07479964381121995, + "ewc_loss": 9.834766387939453e-07, + "grad_norm": 2.072937250137329, + "learning_rate": 2.488342518016108e-07, + "loss": 0.5709, + "mean_token_accuracy": 0.8167726993560791, + "num_tokens": 22334779.0, + "step": 588 + }, + { + "epoch": 0.07492685408981045, + "ewc_loss": 9.909272193908691e-07, + "grad_norm": 2.1041791439056396, + "learning_rate": 2.492581602373887e-07, + "loss": 0.5305, + "mean_token_accuracy": 0.8333460688591003, + "num_tokens": 22372820.0, + "step": 589 + }, + { + "epoch": 0.07505406436840097, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 1.9774200916290283, + "learning_rate": 2.4968206867316655e-07, + "loss": 0.5499, + "mean_token_accuracy": 0.8261951208114624, + "num_tokens": 22417841.0, + "step": 590 + }, + { + "epoch": 0.07518127464699148, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.114630699157715, + "learning_rate": 2.5010597710894445e-07, + "loss": 0.4651, + "mean_token_accuracy": 0.8523421883583069, + "num_tokens": 22449133.0, + "step": 591 + }, + { + "epoch": 0.07530848492558198, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.0650298595428467, + "learning_rate": 2.505298855447223e-07, + "loss": 0.53, + "mean_token_accuracy": 0.8333895206451416, + "num_tokens": 22491145.0, + "step": 592 + }, + { + "epoch": 0.0754356952041725, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 9.892422676086426, + "learning_rate": 2.509537939805002e-07, + "loss": 0.5422, + "mean_token_accuracy": 0.8301504850387573, + "num_tokens": 22531738.0, + "step": 593 + }, + { + "epoch": 0.07556290548276301, + "ewc_loss": 1.0132789611816406e-06, + "grad_norm": 2.536193370819092, + "learning_rate": 2.513777024162781e-07, + "loss": 0.5223, + "mean_token_accuracy": 0.8364499807357788, + "num_tokens": 22563005.0, + "step": 594 + }, + { + "epoch": 0.07569011576135352, + "ewc_loss": 1.0207295417785645e-06, + "grad_norm": 1.8493947982788086, + "learning_rate": 2.5180161085205594e-07, + "loss": 0.5044, + "mean_token_accuracy": 0.8409795761108398, + "num_tokens": 22601333.0, + "step": 595 + }, + { + "epoch": 0.07581732603994402, + "ewc_loss": 1.0207295417785645e-06, + "grad_norm": 2.130152702331543, + "learning_rate": 2.522255192878338e-07, + "loss": 0.5549, + "mean_token_accuracy": 0.8236430883407593, + "num_tokens": 22639528.0, + "step": 596 + }, + { + "epoch": 0.07594453631853454, + "ewc_loss": 1.0207295417785645e-06, + "grad_norm": 1.9977797269821167, + "learning_rate": 2.526494277236117e-07, + "loss": 0.5384, + "mean_token_accuracy": 0.8308155536651611, + "num_tokens": 22675290.0, + "step": 597 + }, + { + "epoch": 0.07607174659712505, + "ewc_loss": 1.0207295417785645e-06, + "grad_norm": 2.032132148742676, + "learning_rate": 2.530733361593896e-07, + "loss": 0.5147, + "mean_token_accuracy": 0.8349794149398804, + "num_tokens": 22712823.0, + "step": 598 + }, + { + "epoch": 0.07619895687571555, + "ewc_loss": 1.0207295417785645e-06, + "grad_norm": 2.0179755687713623, + "learning_rate": 2.5349724459516743e-07, + "loss": 0.5372, + "mean_token_accuracy": 0.830488383769989, + "num_tokens": 22750916.0, + "step": 599 + }, + { + "epoch": 0.07632616715430607, + "ewc_loss": 1.0281801223754883e-06, + "grad_norm": 2.2998363971710205, + "learning_rate": 2.539211530309453e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.8419886231422424, + "num_tokens": 22782537.0, + "step": 600 + }, + { + "epoch": 0.07645337743289658, + "ewc_loss": 1.0281801223754883e-06, + "grad_norm": 2.4413304328918457, + "learning_rate": 2.543450614667232e-07, + "loss": 0.5781, + "mean_token_accuracy": 0.8183180689811707, + "num_tokens": 22814805.0, + "step": 601 + }, + { + "epoch": 0.07658058771148708, + "ewc_loss": 1.0281801223754883e-06, + "grad_norm": 2.1946163177490234, + "learning_rate": 2.547689699025011e-07, + "loss": 0.5771, + "mean_token_accuracy": 0.8177695870399475, + "num_tokens": 22852654.0, + "step": 602 + }, + { + "epoch": 0.0767077979900776, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 2.044671058654785, + "learning_rate": 2.551928783382789e-07, + "loss": 0.5307, + "mean_token_accuracy": 0.8293642997741699, + "num_tokens": 22894205.0, + "step": 603 + }, + { + "epoch": 0.07683500826866811, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 2.1175873279571533, + "learning_rate": 2.5561678677405677e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.8403069376945496, + "num_tokens": 22924095.0, + "step": 604 + }, + { + "epoch": 0.07696221854725861, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 2.02811598777771, + "learning_rate": 2.5604069520983467e-07, + "loss": 0.5785, + "mean_token_accuracy": 0.8157904148101807, + "num_tokens": 22960857.0, + "step": 605 + }, + { + "epoch": 0.07708942882584913, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 1.8592579364776611, + "learning_rate": 2.564646036456125e-07, + "loss": 0.5331, + "mean_token_accuracy": 0.8281331658363342, + "num_tokens": 23004670.0, + "step": 606 + }, + { + "epoch": 0.07721663910443964, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 2.1434824466705322, + "learning_rate": 2.568885120813904e-07, + "loss": 0.5317, + "mean_token_accuracy": 0.8293766975402832, + "num_tokens": 23041999.0, + "step": 607 + }, + { + "epoch": 0.07734384938303016, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 1.9641133546829224, + "learning_rate": 2.5731242051716826e-07, + "loss": 0.5209, + "mean_token_accuracy": 0.8366116285324097, + "num_tokens": 23080704.0, + "step": 608 + }, + { + "epoch": 0.07747105966162066, + "ewc_loss": 1.043081283569336e-06, + "grad_norm": 2.3526768684387207, + "learning_rate": 2.5773632895294616e-07, + "loss": 0.5556, + "mean_token_accuracy": 0.8263466358184814, + "num_tokens": 23118662.0, + "step": 609 + }, + { + "epoch": 0.07759826994021117, + "ewc_loss": 1.043081283569336e-06, + "grad_norm": 2.0280299186706543, + "learning_rate": 2.58160237388724e-07, + "loss": 0.4787, + "mean_token_accuracy": 0.8441120386123657, + "num_tokens": 23150367.0, + "step": 610 + }, + { + "epoch": 0.07772548021880168, + "ewc_loss": 1.043081283569336e-06, + "grad_norm": 2.22943377494812, + "learning_rate": 2.585841458245019e-07, + "loss": 0.5363, + "mean_token_accuracy": 0.8291909694671631, + "num_tokens": 23185836.0, + "step": 611 + }, + { + "epoch": 0.07785269049739219, + "ewc_loss": 1.043081283569336e-06, + "grad_norm": 1.9650459289550781, + "learning_rate": 2.5900805426027975e-07, + "loss": 0.4781, + "mean_token_accuracy": 0.8466414213180542, + "num_tokens": 23219346.0, + "step": 612 + }, + { + "epoch": 0.0779799007759827, + "ewc_loss": 1.043081283569336e-06, + "grad_norm": 1.9990428686141968, + "learning_rate": 2.5943196269605765e-07, + "loss": 0.548, + "mean_token_accuracy": 0.8327380418777466, + "num_tokens": 23257569.0, + "step": 613 + }, + { + "epoch": 0.07810711105457321, + "ewc_loss": 1.043081283569336e-06, + "grad_norm": 1.7596073150634766, + "learning_rate": 2.598558711318355e-07, + "loss": 0.5403, + "mean_token_accuracy": 0.8312442302703857, + "num_tokens": 23302944.0, + "step": 614 + }, + { + "epoch": 0.07823432133316371, + "ewc_loss": 1.043081283569336e-06, + "grad_norm": 1.96662175655365, + "learning_rate": 2.602797795676134e-07, + "loss": 0.5467, + "mean_token_accuracy": 0.8272722363471985, + "num_tokens": 23341660.0, + "step": 615 + }, + { + "epoch": 0.07836153161175423, + "ewc_loss": 1.043081283569336e-06, + "grad_norm": 2.0587239265441895, + "learning_rate": 2.6070368800339124e-07, + "loss": 0.5418, + "mean_token_accuracy": 0.8238033652305603, + "num_tokens": 23375956.0, + "step": 616 + }, + { + "epoch": 0.07848874189034474, + "ewc_loss": 1.043081283569336e-06, + "grad_norm": 1.9979593753814697, + "learning_rate": 2.6112759643916914e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.8416213989257812, + "num_tokens": 23407568.0, + "step": 617 + }, + { + "epoch": 0.07861595216893524, + "ewc_loss": 1.0579824447631836e-06, + "grad_norm": 2.177793502807617, + "learning_rate": 2.61551504874947e-07, + "loss": 0.5255, + "mean_token_accuracy": 0.8326936960220337, + "num_tokens": 23440261.0, + "step": 618 + }, + { + "epoch": 0.07874316244752576, + "ewc_loss": 1.0579824447631836e-06, + "grad_norm": 1.940701961517334, + "learning_rate": 2.619754133107249e-07, + "loss": 0.5103, + "mean_token_accuracy": 0.8387783169746399, + "num_tokens": 23481706.0, + "step": 619 + }, + { + "epoch": 0.07887037272611627, + "ewc_loss": 1.0579824447631836e-06, + "grad_norm": 2.222728729248047, + "learning_rate": 2.623993217465028e-07, + "loss": 0.5696, + "mean_token_accuracy": 0.8170429468154907, + "num_tokens": 23515410.0, + "step": 620 + }, + { + "epoch": 0.07899758300470679, + "ewc_loss": 1.0728836059570312e-06, + "grad_norm": 1.94465172290802, + "learning_rate": 2.6282323018228063e-07, + "loss": 0.5526, + "mean_token_accuracy": 0.828661322593689, + "num_tokens": 23554627.0, + "step": 621 + }, + { + "epoch": 0.07912479328329729, + "ewc_loss": 1.087784767150879e-06, + "grad_norm": 1.9507476091384888, + "learning_rate": 2.632471386180585e-07, + "loss": 0.577, + "mean_token_accuracy": 0.8167271614074707, + "num_tokens": 23593400.0, + "step": 622 + }, + { + "epoch": 0.0792520035618878, + "ewc_loss": 1.087784767150879e-06, + "grad_norm": 2.329629898071289, + "learning_rate": 2.6367104705383637e-07, + "loss": 0.5737, + "mean_token_accuracy": 0.8175866603851318, + "num_tokens": 23628242.0, + "step": 623 + }, + { + "epoch": 0.07937921384047832, + "ewc_loss": 1.087784767150879e-06, + "grad_norm": 1.878658652305603, + "learning_rate": 2.6409495548961427e-07, + "loss": 0.5606, + "mean_token_accuracy": 0.8260926604270935, + "num_tokens": 23671041.0, + "step": 624 + }, + { + "epoch": 0.07950642411906882, + "ewc_loss": 1.087784767150879e-06, + "grad_norm": 2.312023639678955, + "learning_rate": 2.6451886392539206e-07, + "loss": 0.5379, + "mean_token_accuracy": 0.8324928283691406, + "num_tokens": 23701904.0, + "step": 625 + }, + { + "epoch": 0.07963363439765933, + "ewc_loss": 1.087784767150879e-06, + "grad_norm": 1.9116731882095337, + "learning_rate": 2.6494277236116996e-07, + "loss": 0.5028, + "mean_token_accuracy": 0.8393506407737732, + "num_tokens": 23742374.0, + "step": 626 + }, + { + "epoch": 0.07976084467624985, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 2.082223415374756, + "learning_rate": 2.6536668079694786e-07, + "loss": 0.6072, + "mean_token_accuracy": 0.8099650144577026, + "num_tokens": 23780216.0, + "step": 627 + }, + { + "epoch": 0.07988805495484035, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 2.0895912647247314, + "learning_rate": 2.6579058923272576e-07, + "loss": 0.4625, + "mean_token_accuracy": 0.8524177670478821, + "num_tokens": 23813641.0, + "step": 628 + }, + { + "epoch": 0.08001526523343086, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 2.0724008083343506, + "learning_rate": 2.6621449766850356e-07, + "loss": 0.5465, + "mean_token_accuracy": 0.8274329900741577, + "num_tokens": 23850189.0, + "step": 629 + }, + { + "epoch": 0.08014247551202137, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 1.9312529563903809, + "learning_rate": 2.6663840610428145e-07, + "loss": 0.5022, + "mean_token_accuracy": 0.8394320011138916, + "num_tokens": 23884410.0, + "step": 630 + }, + { + "epoch": 0.08026968579061187, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 1.8380101919174194, + "learning_rate": 2.6706231454005935e-07, + "loss": 0.5102, + "mean_token_accuracy": 0.8377610445022583, + "num_tokens": 23930139.0, + "step": 631 + }, + { + "epoch": 0.08039689606920239, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 1.743642807006836, + "learning_rate": 2.6748622297583725e-07, + "loss": 0.5095, + "mean_token_accuracy": 0.8383902311325073, + "num_tokens": 23973124.0, + "step": 632 + }, + { + "epoch": 0.0805241063477929, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 1.878254771232605, + "learning_rate": 2.6791013141161505e-07, + "loss": 0.5174, + "mean_token_accuracy": 0.8376110792160034, + "num_tokens": 24008676.0, + "step": 633 + }, + { + "epoch": 0.08065131662638342, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 2.169037103652954, + "learning_rate": 2.6833403984739294e-07, + "loss": 0.518, + "mean_token_accuracy": 0.8350719213485718, + "num_tokens": 24044880.0, + "step": 634 + }, + { + "epoch": 0.08077852690497392, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 1.709686040878296, + "learning_rate": 2.6875794828317084e-07, + "loss": 0.5074, + "mean_token_accuracy": 0.8419762253761292, + "num_tokens": 24090140.0, + "step": 635 + }, + { + "epoch": 0.08090573718356443, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 2.4554829597473145, + "learning_rate": 2.6918185671894874e-07, + "loss": 0.5897, + "mean_token_accuracy": 0.8177673816680908, + "num_tokens": 24123311.0, + "step": 636 + }, + { + "epoch": 0.08103294746215495, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 1.8811172246932983, + "learning_rate": 2.6960576515472654e-07, + "loss": 0.6241, + "mean_token_accuracy": 0.8032007813453674, + "num_tokens": 24164019.0, + "step": 637 + }, + { + "epoch": 0.08116015774074545, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 1.775919795036316, + "learning_rate": 2.7002967359050443e-07, + "loss": 0.4916, + "mean_token_accuracy": 0.8440866470336914, + "num_tokens": 24205986.0, + "step": 638 + }, + { + "epoch": 0.08128736801933596, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 1.7722575664520264, + "learning_rate": 2.7045358202628233e-07, + "loss": 0.5484, + "mean_token_accuracy": 0.8232289552688599, + "num_tokens": 24251956.0, + "step": 639 + }, + { + "epoch": 0.08141457829792648, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 1.9126743078231812, + "learning_rate": 2.7087749046206023e-07, + "loss": 0.4963, + "mean_token_accuracy": 0.8401033878326416, + "num_tokens": 24284990.0, + "step": 640 + }, + { + "epoch": 0.08154178857651698, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 1.9557973146438599, + "learning_rate": 2.71301398897838e-07, + "loss": 0.5522, + "mean_token_accuracy": 0.8245285749435425, + "num_tokens": 24323305.0, + "step": 641 + }, + { + "epoch": 0.08166899885510749, + "ewc_loss": 1.1101365089416504e-06, + "grad_norm": 1.8217984437942505, + "learning_rate": 2.717253073336159e-07, + "loss": 0.4643, + "mean_token_accuracy": 0.850118100643158, + "num_tokens": 24365847.0, + "step": 642 + }, + { + "epoch": 0.081796209133698, + "ewc_loss": 1.1175870895385742e-06, + "grad_norm": 1.7539787292480469, + "learning_rate": 2.721492157693938e-07, + "loss": 0.4701, + "mean_token_accuracy": 0.8485817909240723, + "num_tokens": 24407269.0, + "step": 643 + }, + { + "epoch": 0.0819234194122885, + "ewc_loss": 1.1175870895385742e-06, + "grad_norm": 1.8604520559310913, + "learning_rate": 2.7257312420517167e-07, + "loss": 0.5262, + "mean_token_accuracy": 0.8328819274902344, + "num_tokens": 24444990.0, + "step": 644 + }, + { + "epoch": 0.08205062969087902, + "ewc_loss": 1.1175870895385742e-06, + "grad_norm": 1.7943569421768188, + "learning_rate": 2.729970326409495e-07, + "loss": 0.555, + "mean_token_accuracy": 0.8264642953872681, + "num_tokens": 24487293.0, + "step": 645 + }, + { + "epoch": 0.08217783996946953, + "ewc_loss": 1.1175870895385742e-06, + "grad_norm": 1.9847848415374756, + "learning_rate": 2.734209410767274e-07, + "loss": 0.5177, + "mean_token_accuracy": 0.8358452916145325, + "num_tokens": 24521403.0, + "step": 646 + }, + { + "epoch": 0.08230505024806005, + "ewc_loss": 1.125037670135498e-06, + "grad_norm": 1.8924334049224854, + "learning_rate": 2.738448495125053e-07, + "loss": 0.5288, + "mean_token_accuracy": 0.8351922035217285, + "num_tokens": 24562740.0, + "step": 647 + }, + { + "epoch": 0.08243226052665055, + "ewc_loss": 1.125037670135498e-06, + "grad_norm": 2.0575509071350098, + "learning_rate": 2.7426875794828316e-07, + "loss": 0.5348, + "mean_token_accuracy": 0.8265653848648071, + "num_tokens": 24597507.0, + "step": 648 + }, + { + "epoch": 0.08255947080524106, + "ewc_loss": 1.125037670135498e-06, + "grad_norm": 1.944737195968628, + "learning_rate": 2.74692666384061e-07, + "loss": 0.497, + "mean_token_accuracy": 0.8390342593193054, + "num_tokens": 24635629.0, + "step": 649 + }, + { + "epoch": 0.08268668108383158, + "ewc_loss": 1.1324882507324219e-06, + "grad_norm": 1.81252920627594, + "learning_rate": 2.751165748198389e-07, + "loss": 0.5228, + "mean_token_accuracy": 0.8321808576583862, + "num_tokens": 24676087.0, + "step": 650 + }, + { + "epoch": 0.08281389136242208, + "ewc_loss": 1.125037670135498e-06, + "grad_norm": 2.0121397972106934, + "learning_rate": 2.755404832556168e-07, + "loss": 0.5389, + "mean_token_accuracy": 0.828900158405304, + "num_tokens": 24713652.0, + "step": 651 + }, + { + "epoch": 0.0829411016410126, + "ewc_loss": 1.1324882507324219e-06, + "grad_norm": 1.8867868185043335, + "learning_rate": 2.7596439169139465e-07, + "loss": 0.5225, + "mean_token_accuracy": 0.8354319334030151, + "num_tokens": 24750130.0, + "step": 652 + }, + { + "epoch": 0.08306831191960311, + "ewc_loss": 1.1399388313293457e-06, + "grad_norm": 1.9736220836639404, + "learning_rate": 2.763883001271725e-07, + "loss": 0.5715, + "mean_token_accuracy": 0.8189929723739624, + "num_tokens": 24787337.0, + "step": 653 + }, + { + "epoch": 0.08319552219819361, + "ewc_loss": 1.1399388313293457e-06, + "grad_norm": 2.0915608406066895, + "learning_rate": 2.768122085629504e-07, + "loss": 0.564, + "mean_token_accuracy": 0.8160289525985718, + "num_tokens": 24822736.0, + "step": 654 + }, + { + "epoch": 0.08332273247678412, + "ewc_loss": 1.1399388313293457e-06, + "grad_norm": 1.9773740768432617, + "learning_rate": 2.772361169987283e-07, + "loss": 0.5072, + "mean_token_accuracy": 0.8416378498077393, + "num_tokens": 24858743.0, + "step": 655 + }, + { + "epoch": 0.08344994275537464, + "ewc_loss": 1.1399388313293457e-06, + "grad_norm": 1.7878146171569824, + "learning_rate": 2.7766002543450614e-07, + "loss": 0.5146, + "mean_token_accuracy": 0.8329986929893494, + "num_tokens": 24901286.0, + "step": 656 + }, + { + "epoch": 0.08357715303396514, + "ewc_loss": 1.1473894119262695e-06, + "grad_norm": 1.9155948162078857, + "learning_rate": 2.78083933870284e-07, + "loss": 0.5815, + "mean_token_accuracy": 0.8178554177284241, + "num_tokens": 24943457.0, + "step": 657 + }, + { + "epoch": 0.08370436331255565, + "ewc_loss": 1.1473894119262695e-06, + "grad_norm": 1.9233098030090332, + "learning_rate": 2.785078423060619e-07, + "loss": 0.5224, + "mean_token_accuracy": 0.8302665948867798, + "num_tokens": 24979247.0, + "step": 658 + }, + { + "epoch": 0.08383157359114617, + "ewc_loss": 1.1548399925231934e-06, + "grad_norm": 1.8870649337768555, + "learning_rate": 2.789317507418398e-07, + "loss": 0.536, + "mean_token_accuracy": 0.8293519020080566, + "num_tokens": 25017456.0, + "step": 659 + }, + { + "epoch": 0.08395878386973668, + "ewc_loss": 1.1548399925231934e-06, + "grad_norm": 1.9355496168136597, + "learning_rate": 2.7935565917761763e-07, + "loss": 0.4928, + "mean_token_accuracy": 0.8402693271636963, + "num_tokens": 25054682.0, + "step": 660 + }, + { + "epoch": 0.08408599414832718, + "ewc_loss": 1.1548399925231934e-06, + "grad_norm": 2.1686925888061523, + "learning_rate": 2.797795676133955e-07, + "loss": 0.564, + "mean_token_accuracy": 0.8190574645996094, + "num_tokens": 25092501.0, + "step": 661 + }, + { + "epoch": 0.0842132044269177, + "ewc_loss": 1.1622905731201172e-06, + "grad_norm": 1.9743072986602783, + "learning_rate": 2.802034760491734e-07, + "loss": 0.4356, + "mean_token_accuracy": 0.8601087927818298, + "num_tokens": 25129253.0, + "step": 662 + }, + { + "epoch": 0.08434041470550821, + "ewc_loss": 1.1548399925231934e-06, + "grad_norm": 1.736746072769165, + "learning_rate": 2.806273844849512e-07, + "loss": 0.4736, + "mean_token_accuracy": 0.8494622707366943, + "num_tokens": 25169119.0, + "step": 663 + }, + { + "epoch": 0.08446762498409871, + "ewc_loss": 1.1548399925231934e-06, + "grad_norm": 1.925126552581787, + "learning_rate": 2.810512929207291e-07, + "loss": 0.5431, + "mean_token_accuracy": 0.8223632574081421, + "num_tokens": 25207654.0, + "step": 664 + }, + { + "epoch": 0.08459483526268922, + "ewc_loss": 1.1771917343139648e-06, + "grad_norm": 1.8454777002334595, + "learning_rate": 2.8147520135650697e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.8403327465057373, + "num_tokens": 25243258.0, + "step": 665 + }, + { + "epoch": 0.08472204554127974, + "ewc_loss": 1.1771917343139648e-06, + "grad_norm": 1.8305630683898926, + "learning_rate": 2.8189910979228487e-07, + "loss": 0.5284, + "mean_token_accuracy": 0.8331127166748047, + "num_tokens": 25281301.0, + "step": 666 + }, + { + "epoch": 0.08484925581987024, + "ewc_loss": 1.1771917343139648e-06, + "grad_norm": 2.018576145172119, + "learning_rate": 2.823230182280627e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.8466088771820068, + "num_tokens": 25313500.0, + "step": 667 + }, + { + "epoch": 0.08497646609846075, + "ewc_loss": 1.1771917343139648e-06, + "grad_norm": 1.8469109535217285, + "learning_rate": 2.827469266638406e-07, + "loss": 0.5169, + "mean_token_accuracy": 0.8347123861312866, + "num_tokens": 25355121.0, + "step": 668 + }, + { + "epoch": 0.08510367637705127, + "ewc_loss": 1.1771917343139648e-06, + "grad_norm": 1.7627034187316895, + "learning_rate": 2.8317083509961846e-07, + "loss": 0.5323, + "mean_token_accuracy": 0.8314377665519714, + "num_tokens": 25398868.0, + "step": 669 + }, + { + "epoch": 0.08523088665564178, + "ewc_loss": 1.1771917343139648e-06, + "grad_norm": 2.0932981967926025, + "learning_rate": 2.8359474353539636e-07, + "loss": 0.5119, + "mean_token_accuracy": 0.8356409668922424, + "num_tokens": 25438670.0, + "step": 670 + }, + { + "epoch": 0.08535809693423228, + "ewc_loss": 1.1995434761047363e-06, + "grad_norm": 1.9355626106262207, + "learning_rate": 2.840186519711742e-07, + "loss": 0.558, + "mean_token_accuracy": 0.8262837529182434, + "num_tokens": 25479243.0, + "step": 671 + }, + { + "epoch": 0.0854853072128228, + "ewc_loss": 1.1846423149108887e-06, + "grad_norm": 1.9342459440231323, + "learning_rate": 2.844425604069521e-07, + "loss": 0.5424, + "mean_token_accuracy": 0.8270067572593689, + "num_tokens": 25514615.0, + "step": 672 + }, + { + "epoch": 0.08561251749141331, + "ewc_loss": 1.1995434761047363e-06, + "grad_norm": 1.9677084684371948, + "learning_rate": 2.8486646884272995e-07, + "loss": 0.5187, + "mean_token_accuracy": 0.8336912393569946, + "num_tokens": 25552753.0, + "step": 673 + }, + { + "epoch": 0.08573972777000381, + "ewc_loss": 1.1995434761047363e-06, + "grad_norm": 1.9252357482910156, + "learning_rate": 2.8529037727850785e-07, + "loss": 0.4574, + "mean_token_accuracy": 0.8510570526123047, + "num_tokens": 25587415.0, + "step": 674 + }, + { + "epoch": 0.08586693804859433, + "ewc_loss": 1.1995434761047363e-06, + "grad_norm": 1.8048579692840576, + "learning_rate": 2.857142857142857e-07, + "loss": 0.4853, + "mean_token_accuracy": 0.8462666273117065, + "num_tokens": 25625962.0, + "step": 675 + }, + { + "epoch": 0.08599414832718484, + "ewc_loss": 1.2069940567016602e-06, + "grad_norm": 1.9667948484420776, + "learning_rate": 2.861381941500636e-07, + "loss": 0.5339, + "mean_token_accuracy": 0.8294416069984436, + "num_tokens": 25669680.0, + "step": 676 + }, + { + "epoch": 0.08612135860577534, + "ewc_loss": 1.2069940567016602e-06, + "grad_norm": 1.920656681060791, + "learning_rate": 2.8656210258584144e-07, + "loss": 0.5679, + "mean_token_accuracy": 0.8157346248626709, + "num_tokens": 25709221.0, + "step": 677 + }, + { + "epoch": 0.08624856888436586, + "ewc_loss": 1.2069940567016602e-06, + "grad_norm": 2.005793333053589, + "learning_rate": 2.869860110216193e-07, + "loss": 0.5695, + "mean_token_accuracy": 0.8166475892066956, + "num_tokens": 25741880.0, + "step": 678 + }, + { + "epoch": 0.08637577916295637, + "ewc_loss": 1.214444637298584e-06, + "grad_norm": 1.8000222444534302, + "learning_rate": 2.874099194573972e-07, + "loss": 0.5032, + "mean_token_accuracy": 0.839015781879425, + "num_tokens": 25786282.0, + "step": 679 + }, + { + "epoch": 0.08650298944154687, + "ewc_loss": 1.214444637298584e-06, + "grad_norm": 1.9464881420135498, + "learning_rate": 2.878338278931751e-07, + "loss": 0.5476, + "mean_token_accuracy": 0.827908992767334, + "num_tokens": 25820229.0, + "step": 680 + }, + { + "epoch": 0.08663019972013739, + "ewc_loss": 1.214444637298584e-06, + "grad_norm": 1.8703807592391968, + "learning_rate": 2.8825773632895293e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.8484534621238708, + "num_tokens": 25856465.0, + "step": 681 + }, + { + "epoch": 0.0867574099987279, + "ewc_loss": 1.2218952178955078e-06, + "grad_norm": 1.7389811277389526, + "learning_rate": 2.886816447647308e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.8446571230888367, + "num_tokens": 25897407.0, + "step": 682 + }, + { + "epoch": 0.08688462027731841, + "ewc_loss": 1.2218952178955078e-06, + "grad_norm": 1.9689834117889404, + "learning_rate": 2.891055532005087e-07, + "loss": 0.5068, + "mean_token_accuracy": 0.8382961750030518, + "num_tokens": 25938825.0, + "step": 683 + }, + { + "epoch": 0.08701183055590891, + "ewc_loss": 1.2218952178955078e-06, + "grad_norm": 1.985466480255127, + "learning_rate": 2.8952946163628657e-07, + "loss": 0.5351, + "mean_token_accuracy": 0.8271924257278442, + "num_tokens": 25974944.0, + "step": 684 + }, + { + "epoch": 0.08713904083449943, + "ewc_loss": 1.2218952178955078e-06, + "grad_norm": 1.876882791519165, + "learning_rate": 2.899533700720644e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8393163681030273, + "num_tokens": 26012864.0, + "step": 685 + }, + { + "epoch": 0.08726625111308994, + "ewc_loss": 1.2218952178955078e-06, + "grad_norm": 1.831639051437378, + "learning_rate": 2.9037727850784227e-07, + "loss": 0.5401, + "mean_token_accuracy": 0.828172504901886, + "num_tokens": 26049779.0, + "step": 686 + }, + { + "epoch": 0.08739346139168044, + "ewc_loss": 1.2367963790893555e-06, + "grad_norm": 1.892540454864502, + "learning_rate": 2.9080118694362016e-07, + "loss": 0.5116, + "mean_token_accuracy": 0.8377052545547485, + "num_tokens": 26089095.0, + "step": 687 + }, + { + "epoch": 0.08752067167027096, + "ewc_loss": 1.2367963790893555e-06, + "grad_norm": 1.717938780784607, + "learning_rate": 2.9122509537939806e-07, + "loss": 0.5099, + "mean_token_accuracy": 0.8383570909500122, + "num_tokens": 26132553.0, + "step": 688 + }, + { + "epoch": 0.08764788194886147, + "ewc_loss": 1.2367963790893555e-06, + "grad_norm": 1.9243953227996826, + "learning_rate": 2.916490038151759e-07, + "loss": 0.562, + "mean_token_accuracy": 0.8177449703216553, + "num_tokens": 26171301.0, + "step": 689 + }, + { + "epoch": 0.08777509222745197, + "ewc_loss": 1.2367963790893555e-06, + "grad_norm": 1.8062856197357178, + "learning_rate": 2.9207291225095376e-07, + "loss": 0.5482, + "mean_token_accuracy": 0.8236544132232666, + "num_tokens": 26217871.0, + "step": 690 + }, + { + "epoch": 0.08790230250604249, + "ewc_loss": 1.2367963790893555e-06, + "grad_norm": 1.7644578218460083, + "learning_rate": 2.9249682068673166e-07, + "loss": 0.5127, + "mean_token_accuracy": 0.8351151347160339, + "num_tokens": 26255927.0, + "step": 691 + }, + { + "epoch": 0.088029512784633, + "ewc_loss": 1.2367963790893555e-06, + "grad_norm": 1.7687445878982544, + "learning_rate": 2.9292072912250955e-07, + "loss": 0.495, + "mean_token_accuracy": 0.8435022830963135, + "num_tokens": 26296041.0, + "step": 692 + }, + { + "epoch": 0.0881567230632235, + "ewc_loss": 1.2367963790893555e-06, + "grad_norm": 1.9535168409347534, + "learning_rate": 2.933446375582874e-07, + "loss": 0.5754, + "mean_token_accuracy": 0.8180407285690308, + "num_tokens": 26333785.0, + "step": 693 + }, + { + "epoch": 0.08828393334181402, + "ewc_loss": 1.2516975402832031e-06, + "grad_norm": 1.844524621963501, + "learning_rate": 2.9376854599406525e-07, + "loss": 0.4839, + "mean_token_accuracy": 0.8420708775520325, + "num_tokens": 26368837.0, + "step": 694 + }, + { + "epoch": 0.08841114362040453, + "ewc_loss": 1.2516975402832031e-06, + "grad_norm": 1.9737039804458618, + "learning_rate": 2.9419245442984315e-07, + "loss": 0.5573, + "mean_token_accuracy": 0.8221763372421265, + "num_tokens": 26402151.0, + "step": 695 + }, + { + "epoch": 0.08853835389899505, + "ewc_loss": 1.2665987014770508e-06, + "grad_norm": 1.8516956567764282, + "learning_rate": 2.9461636286562104e-07, + "loss": 0.5116, + "mean_token_accuracy": 0.8338582515716553, + "num_tokens": 26435550.0, + "step": 696 + }, + { + "epoch": 0.08866556417758555, + "ewc_loss": 1.2665987014770508e-06, + "grad_norm": 1.6820927858352661, + "learning_rate": 2.9504027130139884e-07, + "loss": 0.5027, + "mean_token_accuracy": 0.839464545249939, + "num_tokens": 26479299.0, + "step": 697 + }, + { + "epoch": 0.08879277445617606, + "ewc_loss": 1.2814998626708984e-06, + "grad_norm": 1.788969874382019, + "learning_rate": 2.9546417973717674e-07, + "loss": 0.5393, + "mean_token_accuracy": 0.8285683393478394, + "num_tokens": 26519380.0, + "step": 698 + }, + { + "epoch": 0.08891998473476657, + "ewc_loss": 1.2814998626708984e-06, + "grad_norm": 1.7706694602966309, + "learning_rate": 2.9588808817295464e-07, + "loss": 0.4644, + "mean_token_accuracy": 0.8488512635231018, + "num_tokens": 26557306.0, + "step": 699 + }, + { + "epoch": 0.08904719501335707, + "ewc_loss": 1.2814998626708984e-06, + "grad_norm": 1.9799823760986328, + "learning_rate": 2.9631199660873253e-07, + "loss": 0.5346, + "mean_token_accuracy": 0.8308514952659607, + "num_tokens": 26593549.0, + "step": 700 + }, + { + "epoch": 0.08917440529194759, + "ewc_loss": 1.296401023864746e-06, + "grad_norm": 2.1211445331573486, + "learning_rate": 2.9673590504451033e-07, + "loss": 0.4598, + "mean_token_accuracy": 0.8531254529953003, + "num_tokens": 26629926.0, + "step": 701 + }, + { + "epoch": 0.0893016155705381, + "ewc_loss": 1.296401023864746e-06, + "grad_norm": 1.7520359754562378, + "learning_rate": 2.9715981348028823e-07, + "loss": 0.4913, + "mean_token_accuracy": 0.8441110849380493, + "num_tokens": 26668652.0, + "step": 702 + }, + { + "epoch": 0.0894288258491286, + "ewc_loss": 1.296401023864746e-06, + "grad_norm": 1.9166855812072754, + "learning_rate": 2.975837219160661e-07, + "loss": 0.5521, + "mean_token_accuracy": 0.8239004611968994, + "num_tokens": 26708084.0, + "step": 703 + }, + { + "epoch": 0.08955603612771912, + "ewc_loss": 1.296401023864746e-06, + "grad_norm": 1.723092794418335, + "learning_rate": 2.98007630351844e-07, + "loss": 0.5191, + "mean_token_accuracy": 0.834369957447052, + "num_tokens": 26750134.0, + "step": 704 + }, + { + "epoch": 0.08968324640630963, + "ewc_loss": 1.296401023864746e-06, + "grad_norm": 1.914815068244934, + "learning_rate": 2.984315387876218e-07, + "loss": 0.5272, + "mean_token_accuracy": 0.8309703469276428, + "num_tokens": 26785410.0, + "step": 705 + }, + { + "epoch": 0.08981045668490013, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 1.9355136156082153, + "learning_rate": 2.988554472233997e-07, + "loss": 0.5333, + "mean_token_accuracy": 0.8323190212249756, + "num_tokens": 26825226.0, + "step": 706 + }, + { + "epoch": 0.08993766696349065, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 2.0679991245269775, + "learning_rate": 2.992793556591776e-07, + "loss": 0.5244, + "mean_token_accuracy": 0.832349419593811, + "num_tokens": 26865401.0, + "step": 707 + }, + { + "epoch": 0.09006487724208116, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 1.8594799041748047, + "learning_rate": 2.997032640949555e-07, + "loss": 0.5356, + "mean_token_accuracy": 0.8310428857803345, + "num_tokens": 26905365.0, + "step": 708 + }, + { + "epoch": 0.09019208752067168, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 1.7586232423782349, + "learning_rate": 3.001271725307333e-07, + "loss": 0.4759, + "mean_token_accuracy": 0.8452701568603516, + "num_tokens": 26941348.0, + "step": 709 + }, + { + "epoch": 0.09031929779926218, + "ewc_loss": 1.3113021850585938e-06, + "grad_norm": 2.792175769805908, + "learning_rate": 3.005510809665112e-07, + "loss": 0.5129, + "mean_token_accuracy": 0.8364055156707764, + "num_tokens": 26975803.0, + "step": 710 + }, + { + "epoch": 0.09044650807785269, + "ewc_loss": 1.3113021850585938e-06, + "grad_norm": 1.836457371711731, + "learning_rate": 3.009749894022891e-07, + "loss": 0.459, + "mean_token_accuracy": 0.8524163961410522, + "num_tokens": 27014931.0, + "step": 711 + }, + { + "epoch": 0.0905737183564432, + "ewc_loss": 1.3113021850585938e-06, + "grad_norm": 1.8483428955078125, + "learning_rate": 3.01398897838067e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.8388288617134094, + "num_tokens": 27055505.0, + "step": 712 + }, + { + "epoch": 0.0907009286350337, + "ewc_loss": 1.3113021850585938e-06, + "grad_norm": 1.8561123609542847, + "learning_rate": 3.018228062738448e-07, + "loss": 0.5416, + "mean_token_accuracy": 0.830817699432373, + "num_tokens": 27092121.0, + "step": 713 + }, + { + "epoch": 0.09082813891362422, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 1.6871851682662964, + "learning_rate": 3.022467147096227e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.846235990524292, + "num_tokens": 27129827.0, + "step": 714 + }, + { + "epoch": 0.09095534919221474, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 1.7322514057159424, + "learning_rate": 3.026706231454006e-07, + "loss": 0.4949, + "mean_token_accuracy": 0.8410321474075317, + "num_tokens": 27174977.0, + "step": 715 + }, + { + "epoch": 0.09108255947080524, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 1.8944381475448608, + "learning_rate": 3.0309453158117844e-07, + "loss": 0.5008, + "mean_token_accuracy": 0.8395618200302124, + "num_tokens": 27211472.0, + "step": 716 + }, + { + "epoch": 0.09120976974939575, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 1.9040566682815552, + "learning_rate": 3.035184400169563e-07, + "loss": 0.579, + "mean_token_accuracy": 0.8191559314727783, + "num_tokens": 27249160.0, + "step": 717 + }, + { + "epoch": 0.09133698002798626, + "ewc_loss": 1.3113021850585938e-06, + "grad_norm": 1.9774909019470215, + "learning_rate": 3.039423484527342e-07, + "loss": 0.4926, + "mean_token_accuracy": 0.8413291573524475, + "num_tokens": 27287946.0, + "step": 718 + }, + { + "epoch": 0.09146419030657676, + "ewc_loss": 1.3113021850585938e-06, + "grad_norm": 1.884929895401001, + "learning_rate": 3.043662568885121e-07, + "loss": 0.5237, + "mean_token_accuracy": 0.8290523290634155, + "num_tokens": 27323388.0, + "step": 719 + }, + { + "epoch": 0.09159140058516728, + "ewc_loss": 1.3113021850585938e-06, + "grad_norm": 2.096980094909668, + "learning_rate": 3.0479016532428993e-07, + "loss": 0.5732, + "mean_token_accuracy": 0.8171455264091492, + "num_tokens": 27357524.0, + "step": 720 + }, + { + "epoch": 0.0917186108637578, + "ewc_loss": 1.3262033462524414e-06, + "grad_norm": 2.037458896636963, + "learning_rate": 3.052140737600678e-07, + "loss": 0.4823, + "mean_token_accuracy": 0.8430379629135132, + "num_tokens": 27392037.0, + "step": 721 + }, + { + "epoch": 0.09184582114234831, + "ewc_loss": 1.3262033462524414e-06, + "grad_norm": 1.8589602708816528, + "learning_rate": 3.056379821958457e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.8420130014419556, + "num_tokens": 27429149.0, + "step": 722 + }, + { + "epoch": 0.09197303142093881, + "ewc_loss": 1.3336539268493652e-06, + "grad_norm": 1.9244297742843628, + "learning_rate": 3.060618906316236e-07, + "loss": 0.497, + "mean_token_accuracy": 0.8420851230621338, + "num_tokens": 27465023.0, + "step": 723 + }, + { + "epoch": 0.09210024169952932, + "ewc_loss": 1.3336539268493652e-06, + "grad_norm": 1.9507546424865723, + "learning_rate": 3.064857990674014e-07, + "loss": 0.4893, + "mean_token_accuracy": 0.8425050973892212, + "num_tokens": 27501268.0, + "step": 724 + }, + { + "epoch": 0.09222745197811984, + "ewc_loss": 1.3336539268493652e-06, + "grad_norm": 1.7644528150558472, + "learning_rate": 3.0690970750317927e-07, + "loss": 0.5404, + "mean_token_accuracy": 0.8290501832962036, + "num_tokens": 27539937.0, + "step": 725 + }, + { + "epoch": 0.09235466225671034, + "ewc_loss": 1.3336539268493652e-06, + "grad_norm": 1.9202204942703247, + "learning_rate": 3.0733361593895717e-07, + "loss": 0.5033, + "mean_token_accuracy": 0.8385527729988098, + "num_tokens": 27574576.0, + "step": 726 + }, + { + "epoch": 0.09248187253530085, + "ewc_loss": 1.3336539268493652e-06, + "grad_norm": 1.7981082201004028, + "learning_rate": 3.0775752437473507e-07, + "loss": 0.5119, + "mean_token_accuracy": 0.8320008516311646, + "num_tokens": 27616036.0, + "step": 727 + }, + { + "epoch": 0.09260908281389137, + "ewc_loss": 1.3336539268493652e-06, + "grad_norm": 1.7705165147781372, + "learning_rate": 3.081814328105129e-07, + "loss": 0.5337, + "mean_token_accuracy": 0.8293706178665161, + "num_tokens": 27662040.0, + "step": 728 + }, + { + "epoch": 0.09273629309248187, + "ewc_loss": 1.3336539268493652e-06, + "grad_norm": 1.926770806312561, + "learning_rate": 3.0860534124629076e-07, + "loss": 0.5638, + "mean_token_accuracy": 0.8203518390655518, + "num_tokens": 27705136.0, + "step": 729 + }, + { + "epoch": 0.09286350337107238, + "ewc_loss": 1.3336539268493652e-06, + "grad_norm": 1.928368091583252, + "learning_rate": 3.0902924968206866e-07, + "loss": 0.5124, + "mean_token_accuracy": 0.837257981300354, + "num_tokens": 27740564.0, + "step": 730 + }, + { + "epoch": 0.0929907136496629, + "ewc_loss": 1.341104507446289e-06, + "grad_norm": 2.2354981899261475, + "learning_rate": 3.0945315811784656e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.8419719338417053, + "num_tokens": 27774788.0, + "step": 731 + }, + { + "epoch": 0.0931179239282534, + "ewc_loss": 1.341104507446289e-06, + "grad_norm": 1.8022572994232178, + "learning_rate": 3.098770665536244e-07, + "loss": 0.6307, + "mean_token_accuracy": 0.7992240190505981, + "num_tokens": 27818300.0, + "step": 732 + }, + { + "epoch": 0.09324513420684391, + "ewc_loss": 1.3485550880432129e-06, + "grad_norm": 1.9535868167877197, + "learning_rate": 3.1030097498940225e-07, + "loss": 0.4766, + "mean_token_accuracy": 0.846472978591919, + "num_tokens": 27857739.0, + "step": 733 + }, + { + "epoch": 0.09337234448543442, + "ewc_loss": 1.3485550880432129e-06, + "grad_norm": 1.8677399158477783, + "learning_rate": 3.1072488342518015e-07, + "loss": 0.5111, + "mean_token_accuracy": 0.8309190273284912, + "num_tokens": 27893897.0, + "step": 734 + }, + { + "epoch": 0.09349955476402494, + "ewc_loss": 1.3485550880432129e-06, + "grad_norm": 1.917673110961914, + "learning_rate": 3.11148791860958e-07, + "loss": 0.5015, + "mean_token_accuracy": 0.8373634815216064, + "num_tokens": 27930511.0, + "step": 735 + }, + { + "epoch": 0.09362676504261544, + "ewc_loss": 1.3560056686401367e-06, + "grad_norm": 1.9821255207061768, + "learning_rate": 3.115727002967359e-07, + "loss": 0.5171, + "mean_token_accuracy": 0.8347612023353577, + "num_tokens": 27961128.0, + "step": 736 + }, + { + "epoch": 0.09375397532120595, + "ewc_loss": 1.3560056686401367e-06, + "grad_norm": 1.7760430574417114, + "learning_rate": 3.1199660873251374e-07, + "loss": 0.5177, + "mean_token_accuracy": 0.835014283657074, + "num_tokens": 28000748.0, + "step": 737 + }, + { + "epoch": 0.09388118559979647, + "ewc_loss": 1.3634562492370605e-06, + "grad_norm": 1.6867899894714355, + "learning_rate": 3.1242051716829164e-07, + "loss": 0.5525, + "mean_token_accuracy": 0.8226405382156372, + "num_tokens": 28046455.0, + "step": 738 + }, + { + "epoch": 0.09400839587838697, + "ewc_loss": 1.3634562492370605e-06, + "grad_norm": 1.8316184282302856, + "learning_rate": 3.128444256040695e-07, + "loss": 0.5268, + "mean_token_accuracy": 0.8325960636138916, + "num_tokens": 28087026.0, + "step": 739 + }, + { + "epoch": 0.09413560615697748, + "ewc_loss": 1.3709068298339844e-06, + "grad_norm": 1.7050586938858032, + "learning_rate": 3.132683340398474e-07, + "loss": 0.4522, + "mean_token_accuracy": 0.851534903049469, + "num_tokens": 28126382.0, + "step": 740 + }, + { + "epoch": 0.094262816435568, + "ewc_loss": 1.3709068298339844e-06, + "grad_norm": 1.7983804941177368, + "learning_rate": 3.1369224247562523e-07, + "loss": 0.4776, + "mean_token_accuracy": 0.8474243879318237, + "num_tokens": 28161198.0, + "step": 741 + }, + { + "epoch": 0.0943900267141585, + "ewc_loss": 1.3709068298339844e-06, + "grad_norm": 1.6498562097549438, + "learning_rate": 3.1411615091140313e-07, + "loss": 0.4476, + "mean_token_accuracy": 0.8560217022895813, + "num_tokens": 28204612.0, + "step": 742 + }, + { + "epoch": 0.09451723699274901, + "ewc_loss": 1.385807991027832e-06, + "grad_norm": 1.7459558248519897, + "learning_rate": 3.14540059347181e-07, + "loss": 0.5013, + "mean_token_accuracy": 0.8391367197036743, + "num_tokens": 28241050.0, + "step": 743 + }, + { + "epoch": 0.09464444727133953, + "ewc_loss": 1.385807991027832e-06, + "grad_norm": 1.759491205215454, + "learning_rate": 3.149639677829589e-07, + "loss": 0.4628, + "mean_token_accuracy": 0.8501918315887451, + "num_tokens": 28278032.0, + "step": 744 + }, + { + "epoch": 0.09477165754993004, + "ewc_loss": 1.385807991027832e-06, + "grad_norm": 1.8082668781280518, + "learning_rate": 3.153878762187368e-07, + "loss": 0.5491, + "mean_token_accuracy": 0.8299806714057922, + "num_tokens": 28315599.0, + "step": 745 + }, + { + "epoch": 0.09489886782852054, + "ewc_loss": 1.3932585716247559e-06, + "grad_norm": 1.768078327178955, + "learning_rate": 3.158117846545146e-07, + "loss": 0.5732, + "mean_token_accuracy": 0.819845974445343, + "num_tokens": 28358504.0, + "step": 746 + }, + { + "epoch": 0.09502607810711106, + "ewc_loss": 1.3932585716247559e-06, + "grad_norm": 2.052300453186035, + "learning_rate": 3.1623569309029247e-07, + "loss": 0.4915, + "mean_token_accuracy": 0.842929482460022, + "num_tokens": 28389626.0, + "step": 747 + }, + { + "epoch": 0.09515328838570157, + "ewc_loss": 1.3932585716247559e-06, + "grad_norm": 1.7359870672225952, + "learning_rate": 3.1665960152607037e-07, + "loss": 0.471, + "mean_token_accuracy": 0.8463891744613647, + "num_tokens": 28430361.0, + "step": 748 + }, + { + "epoch": 0.09528049866429207, + "ewc_loss": 1.3932585716247559e-06, + "grad_norm": 1.784204363822937, + "learning_rate": 3.1708350996184826e-07, + "loss": 0.5087, + "mean_token_accuracy": 0.8356907367706299, + "num_tokens": 28469792.0, + "step": 749 + }, + { + "epoch": 0.09540770894288259, + "ewc_loss": 1.4007091522216797e-06, + "grad_norm": 1.8601093292236328, + "learning_rate": 3.175074183976261e-07, + "loss": 0.4861, + "mean_token_accuracy": 0.8455377817153931, + "num_tokens": 28507569.0, + "step": 750 + }, + { + "epoch": 0.0955349192214731, + "ewc_loss": 1.4007091522216797e-06, + "grad_norm": 1.9234532117843628, + "learning_rate": 3.1793132683340396e-07, + "loss": 0.5128, + "mean_token_accuracy": 0.8370408415794373, + "num_tokens": 28542336.0, + "step": 751 + }, + { + "epoch": 0.0956621295000636, + "ewc_loss": 1.4081597328186035e-06, + "grad_norm": 1.9436486959457397, + "learning_rate": 3.1835523526918186e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.850622296333313, + "num_tokens": 28578587.0, + "step": 752 + }, + { + "epoch": 0.09578933977865411, + "ewc_loss": 1.4081597328186035e-06, + "grad_norm": 1.8875097036361694, + "learning_rate": 3.1877914370495975e-07, + "loss": 0.5432, + "mean_token_accuracy": 0.8275654315948486, + "num_tokens": 28617505.0, + "step": 753 + }, + { + "epoch": 0.09591655005724463, + "ewc_loss": 1.4081597328186035e-06, + "grad_norm": 1.7720563411712646, + "learning_rate": 3.1920305214073755e-07, + "loss": 0.5466, + "mean_token_accuracy": 0.8265494108200073, + "num_tokens": 28658526.0, + "step": 754 + }, + { + "epoch": 0.09604376033583513, + "ewc_loss": 1.4081597328186035e-06, + "grad_norm": 1.7827844619750977, + "learning_rate": 3.1962696057651545e-07, + "loss": 0.4838, + "mean_token_accuracy": 0.8440294861793518, + "num_tokens": 28698825.0, + "step": 755 + }, + { + "epoch": 0.09617097061442564, + "ewc_loss": 1.4081597328186035e-06, + "grad_norm": 1.9318188428878784, + "learning_rate": 3.2005086901229335e-07, + "loss": 0.5266, + "mean_token_accuracy": 0.8292071223258972, + "num_tokens": 28733238.0, + "step": 756 + }, + { + "epoch": 0.09629818089301616, + "ewc_loss": 1.4156103134155273e-06, + "grad_norm": 1.754564881324768, + "learning_rate": 3.2047477744807125e-07, + "loss": 0.5263, + "mean_token_accuracy": 0.8340094089508057, + "num_tokens": 28771552.0, + "step": 757 + }, + { + "epoch": 0.09642539117160667, + "ewc_loss": 1.4156103134155273e-06, + "grad_norm": 1.8893649578094482, + "learning_rate": 3.2089868588384904e-07, + "loss": 0.5563, + "mean_token_accuracy": 0.8200599551200867, + "num_tokens": 28810827.0, + "step": 758 + }, + { + "epoch": 0.09655260145019717, + "ewc_loss": 1.4156103134155273e-06, + "grad_norm": 1.8136416673660278, + "learning_rate": 3.2132259431962694e-07, + "loss": 0.4747, + "mean_token_accuracy": 0.8478595614433289, + "num_tokens": 28849017.0, + "step": 759 + }, + { + "epoch": 0.09667981172878769, + "ewc_loss": 1.4156103134155273e-06, + "grad_norm": 1.7477450370788574, + "learning_rate": 3.2174650275540484e-07, + "loss": 0.5224, + "mean_token_accuracy": 0.8386064767837524, + "num_tokens": 28887659.0, + "step": 760 + }, + { + "epoch": 0.0968070220073782, + "ewc_loss": 1.4230608940124512e-06, + "grad_norm": 2.6262199878692627, + "learning_rate": 3.2217041119118274e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.8431762456893921, + "num_tokens": 28928983.0, + "step": 761 + }, + { + "epoch": 0.0969342322859687, + "ewc_loss": 1.4230608940124512e-06, + "grad_norm": 1.8232108354568481, + "learning_rate": 3.2259431962696053e-07, + "loss": 0.5193, + "mean_token_accuracy": 0.8351995944976807, + "num_tokens": 28962195.0, + "step": 762 + }, + { + "epoch": 0.09706144256455922, + "ewc_loss": 1.4230608940124512e-06, + "grad_norm": 1.9398330450057983, + "learning_rate": 3.2301822806273843e-07, + "loss": 0.5027, + "mean_token_accuracy": 0.8376710414886475, + "num_tokens": 28997351.0, + "step": 763 + }, + { + "epoch": 0.09718865284314973, + "ewc_loss": 1.4230608940124512e-06, + "grad_norm": 1.7344070672988892, + "learning_rate": 3.2344213649851633e-07, + "loss": 0.5282, + "mean_token_accuracy": 0.8311787843704224, + "num_tokens": 29040988.0, + "step": 764 + }, + { + "epoch": 0.09731586312174023, + "ewc_loss": 1.4230608940124512e-06, + "grad_norm": 1.7211179733276367, + "learning_rate": 3.238660449342942e-07, + "loss": 0.4921, + "mean_token_accuracy": 0.8400012850761414, + "num_tokens": 29081873.0, + "step": 765 + }, + { + "epoch": 0.09744307340033075, + "ewc_loss": 1.430511474609375e-06, + "grad_norm": 1.8870298862457275, + "learning_rate": 3.24289953370072e-07, + "loss": 0.4665, + "mean_token_accuracy": 0.845658004283905, + "num_tokens": 29115360.0, + "step": 766 + }, + { + "epoch": 0.09757028367892126, + "ewc_loss": 1.430511474609375e-06, + "grad_norm": 1.6558979749679565, + "learning_rate": 3.247138618058499e-07, + "loss": 0.5094, + "mean_token_accuracy": 0.831852912902832, + "num_tokens": 29160139.0, + "step": 767 + }, + { + "epoch": 0.09769749395751176, + "ewc_loss": 1.430511474609375e-06, + "grad_norm": 1.653857946395874, + "learning_rate": 3.251377702416278e-07, + "loss": 0.5381, + "mean_token_accuracy": 0.830254852771759, + "num_tokens": 29202872.0, + "step": 768 + }, + { + "epoch": 0.09782470423610228, + "ewc_loss": 1.430511474609375e-06, + "grad_norm": 1.6661964654922485, + "learning_rate": 3.255616786774057e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8434403538703918, + "num_tokens": 29246965.0, + "step": 769 + }, + { + "epoch": 0.09795191451469279, + "ewc_loss": 1.430511474609375e-06, + "grad_norm": 1.6343187093734741, + "learning_rate": 3.259855871131835e-07, + "loss": 0.4262, + "mean_token_accuracy": 0.8612722158432007, + "num_tokens": 29289531.0, + "step": 770 + }, + { + "epoch": 0.0980791247932833, + "ewc_loss": 1.430511474609375e-06, + "grad_norm": 1.7142417430877686, + "learning_rate": 3.264094955489614e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.8446612358093262, + "num_tokens": 29330484.0, + "step": 771 + }, + { + "epoch": 0.0982063350718738, + "ewc_loss": 1.430511474609375e-06, + "grad_norm": 1.6727149486541748, + "learning_rate": 3.268334039847393e-07, + "loss": 0.5072, + "mean_token_accuracy": 0.8364452123641968, + "num_tokens": 29369595.0, + "step": 772 + }, + { + "epoch": 0.09833354535046432, + "ewc_loss": 1.430511474609375e-06, + "grad_norm": 1.685147762298584, + "learning_rate": 3.2725731242051715e-07, + "loss": 0.4961, + "mean_token_accuracy": 0.8439481258392334, + "num_tokens": 29411856.0, + "step": 773 + }, + { + "epoch": 0.09846075562905483, + "ewc_loss": 1.430511474609375e-06, + "grad_norm": 1.6623623371124268, + "learning_rate": 3.27681220856295e-07, + "loss": 0.479, + "mean_token_accuracy": 0.8470527529716492, + "num_tokens": 29454754.0, + "step": 774 + }, + { + "epoch": 0.09858796590764533, + "ewc_loss": 1.430511474609375e-06, + "grad_norm": 1.910772442817688, + "learning_rate": 3.281051292920729e-07, + "loss": 0.4763, + "mean_token_accuracy": 0.8447140455245972, + "num_tokens": 29489131.0, + "step": 775 + }, + { + "epoch": 0.09871517618623585, + "ewc_loss": 1.4454126358032227e-06, + "grad_norm": 1.7329223155975342, + "learning_rate": 3.285290377278508e-07, + "loss": 0.5293, + "mean_token_accuracy": 0.8296821117401123, + "num_tokens": 29528292.0, + "step": 776 + }, + { + "epoch": 0.09884238646482636, + "ewc_loss": 1.4454126358032227e-06, + "grad_norm": 1.9057796001434326, + "learning_rate": 3.2895294616362864e-07, + "loss": 0.4647, + "mean_token_accuracy": 0.852145791053772, + "num_tokens": 29564878.0, + "step": 777 + }, + { + "epoch": 0.09896959674341686, + "ewc_loss": 1.4454126358032227e-06, + "grad_norm": 1.7923541069030762, + "learning_rate": 3.293768545994065e-07, + "loss": 0.483, + "mean_token_accuracy": 0.8419971466064453, + "num_tokens": 29602826.0, + "step": 778 + }, + { + "epoch": 0.09909680702200738, + "ewc_loss": 1.4454126358032227e-06, + "grad_norm": 1.8486448526382446, + "learning_rate": 3.298007630351844e-07, + "loss": 0.5516, + "mean_token_accuracy": 0.8269755244255066, + "num_tokens": 29640323.0, + "step": 779 + }, + { + "epoch": 0.09922401730059789, + "ewc_loss": 1.4454126358032227e-06, + "grad_norm": 1.6184589862823486, + "learning_rate": 3.302246714709623e-07, + "loss": 0.4816, + "mean_token_accuracy": 0.8435497283935547, + "num_tokens": 29683302.0, + "step": 780 + }, + { + "epoch": 0.09935122757918839, + "ewc_loss": 1.4528632164001465e-06, + "grad_norm": 1.7117160558700562, + "learning_rate": 3.3064857990674013e-07, + "loss": 0.5058, + "mean_token_accuracy": 0.8406156301498413, + "num_tokens": 29721085.0, + "step": 781 + }, + { + "epoch": 0.0994784378577789, + "ewc_loss": 1.4528632164001465e-06, + "grad_norm": 1.8105387687683105, + "learning_rate": 3.31072488342518e-07, + "loss": 0.458, + "mean_token_accuracy": 0.8494068384170532, + "num_tokens": 29753810.0, + "step": 782 + }, + { + "epoch": 0.09960564813636942, + "ewc_loss": 1.4603137969970703e-06, + "grad_norm": 1.678924798965454, + "learning_rate": 3.314963967782959e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.8456687927246094, + "num_tokens": 29796488.0, + "step": 783 + }, + { + "epoch": 0.09973285841495994, + "ewc_loss": 1.4603137969970703e-06, + "grad_norm": 1.6805708408355713, + "learning_rate": 3.319203052140738e-07, + "loss": 0.4962, + "mean_token_accuracy": 0.8408367037773132, + "num_tokens": 29838181.0, + "step": 784 + }, + { + "epoch": 0.09986006869355044, + "ewc_loss": 1.4677643775939941e-06, + "grad_norm": 1.9749033451080322, + "learning_rate": 3.323442136498516e-07, + "loss": 0.479, + "mean_token_accuracy": 0.8461019396781921, + "num_tokens": 29868893.0, + "step": 785 + }, + { + "epoch": 0.09998727897214095, + "ewc_loss": 1.4677643775939941e-06, + "grad_norm": 1.8885048627853394, + "learning_rate": 3.3276812208562947e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.8395928144454956, + "num_tokens": 29903571.0, + "step": 786 + }, + { + "epoch": 0.10011448925073146, + "ewc_loss": 1.475214958190918e-06, + "grad_norm": 1.8402018547058105, + "learning_rate": 3.3319203052140737e-07, + "loss": 0.5503, + "mean_token_accuracy": 0.8313214182853699, + "num_tokens": 29940438.0, + "step": 787 + }, + { + "epoch": 0.10024169952932196, + "ewc_loss": 1.475214958190918e-06, + "grad_norm": 1.810613989830017, + "learning_rate": 3.336159389571852e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8510515093803406, + "num_tokens": 29976538.0, + "step": 788 + }, + { + "epoch": 0.10036890980791248, + "ewc_loss": 1.4826655387878418e-06, + "grad_norm": 1.66619074344635, + "learning_rate": 3.340398473929631e-07, + "loss": 0.5424, + "mean_token_accuracy": 0.8268822431564331, + "num_tokens": 30019814.0, + "step": 789 + }, + { + "epoch": 0.100496120086503, + "ewc_loss": 1.4826655387878418e-06, + "grad_norm": 1.8478717803955078, + "learning_rate": 3.3446375582874096e-07, + "loss": 0.4932, + "mean_token_accuracy": 0.8424779176712036, + "num_tokens": 30057578.0, + "step": 790 + }, + { + "epoch": 0.1006233303650935, + "ewc_loss": 1.4826655387878418e-06, + "grad_norm": 1.8113384246826172, + "learning_rate": 3.3488766426451886e-07, + "loss": 0.5629, + "mean_token_accuracy": 0.8280032873153687, + "num_tokens": 30094869.0, + "step": 791 + }, + { + "epoch": 0.10075054064368401, + "ewc_loss": 1.4826655387878418e-06, + "grad_norm": 1.7955771684646606, + "learning_rate": 3.353115727002967e-07, + "loss": 0.5254, + "mean_token_accuracy": 0.8327397704124451, + "num_tokens": 30136446.0, + "step": 792 + }, + { + "epoch": 0.10087775092227452, + "ewc_loss": 1.4901161193847656e-06, + "grad_norm": 1.665665626525879, + "learning_rate": 3.357354811360746e-07, + "loss": 0.5189, + "mean_token_accuracy": 0.8300836086273193, + "num_tokens": 30179226.0, + "step": 793 + }, + { + "epoch": 0.10100496120086502, + "ewc_loss": 1.4826655387878418e-06, + "grad_norm": 1.839752435684204, + "learning_rate": 3.3615938957185245e-07, + "loss": 0.5102, + "mean_token_accuracy": 0.8351790904998779, + "num_tokens": 30213890.0, + "step": 794 + }, + { + "epoch": 0.10113217147945554, + "ewc_loss": 1.4826655387878418e-06, + "grad_norm": 1.9706668853759766, + "learning_rate": 3.3658329800763035e-07, + "loss": 0.5598, + "mean_token_accuracy": 0.8183390498161316, + "num_tokens": 30245691.0, + "step": 795 + }, + { + "epoch": 0.10125938175804605, + "ewc_loss": 1.4826655387878418e-06, + "grad_norm": 1.8247153759002686, + "learning_rate": 3.370072064434082e-07, + "loss": 0.5154, + "mean_token_accuracy": 0.8280184268951416, + "num_tokens": 30281738.0, + "step": 796 + }, + { + "epoch": 0.10138659203663657, + "ewc_loss": 1.4826655387878418e-06, + "grad_norm": 1.723746418952942, + "learning_rate": 3.374311148791861e-07, + "loss": 0.4814, + "mean_token_accuracy": 0.843023955821991, + "num_tokens": 30320707.0, + "step": 797 + }, + { + "epoch": 0.10151380231522707, + "ewc_loss": 1.4975666999816895e-06, + "grad_norm": 2.1243934631347656, + "learning_rate": 3.3785502331496394e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.8407968282699585, + "num_tokens": 30351379.0, + "step": 798 + }, + { + "epoch": 0.10164101259381758, + "ewc_loss": 1.4975666999816895e-06, + "grad_norm": 2.0355775356292725, + "learning_rate": 3.3827893175074184e-07, + "loss": 0.5121, + "mean_token_accuracy": 0.8342612981796265, + "num_tokens": 30382395.0, + "step": 799 + }, + { + "epoch": 0.1017682228724081, + "ewc_loss": 1.5050172805786133e-06, + "grad_norm": 1.886910080909729, + "learning_rate": 3.387028401865197e-07, + "loss": 0.5188, + "mean_token_accuracy": 0.8378203511238098, + "num_tokens": 30419758.0, + "step": 800 + }, + { + "epoch": 0.1018954331509986, + "ewc_loss": 1.5050172805786133e-06, + "grad_norm": 1.787421703338623, + "learning_rate": 3.391267486222976e-07, + "loss": 0.5032, + "mean_token_accuracy": 0.8358287811279297, + "num_tokens": 30458115.0, + "step": 801 + }, + { + "epoch": 0.10202264342958911, + "ewc_loss": 1.5050172805786133e-06, + "grad_norm": 2.016655206680298, + "learning_rate": 3.3955065705807543e-07, + "loss": 0.5372, + "mean_token_accuracy": 0.8257460594177246, + "num_tokens": 30489679.0, + "step": 802 + }, + { + "epoch": 0.10214985370817962, + "ewc_loss": 1.5124678611755371e-06, + "grad_norm": 1.8460407257080078, + "learning_rate": 3.3997456549385333e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.8482562303543091, + "num_tokens": 30524874.0, + "step": 803 + }, + { + "epoch": 0.10227706398677013, + "ewc_loss": 1.519918441772461e-06, + "grad_norm": 1.9980409145355225, + "learning_rate": 3.403984739296312e-07, + "loss": 0.546, + "mean_token_accuracy": 0.8256720304489136, + "num_tokens": 30558354.0, + "step": 804 + }, + { + "epoch": 0.10240427426536064, + "ewc_loss": 1.519918441772461e-06, + "grad_norm": 1.8562655448913574, + "learning_rate": 3.408223823654091e-07, + "loss": 0.4772, + "mean_token_accuracy": 0.8457245826721191, + "num_tokens": 30594827.0, + "step": 805 + }, + { + "epoch": 0.10253148454395115, + "ewc_loss": 1.519918441772461e-06, + "grad_norm": 1.8281179666519165, + "learning_rate": 3.412462908011869e-07, + "loss": 0.5205, + "mean_token_accuracy": 0.8389769792556763, + "num_tokens": 30635259.0, + "step": 806 + }, + { + "epoch": 0.10265869482254165, + "ewc_loss": 1.5273690223693848e-06, + "grad_norm": 1.7792326211929321, + "learning_rate": 3.4167019923696477e-07, + "loss": 0.5123, + "mean_token_accuracy": 0.8411235213279724, + "num_tokens": 30681850.0, + "step": 807 + }, + { + "epoch": 0.10278590510113217, + "ewc_loss": 1.5348196029663086e-06, + "grad_norm": 1.743859052658081, + "learning_rate": 3.4209410767274267e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8492101430892944, + "num_tokens": 30721302.0, + "step": 808 + }, + { + "epoch": 0.10291311537972268, + "ewc_loss": 1.519918441772461e-06, + "grad_norm": 1.8621572256088257, + "learning_rate": 3.4251801610852057e-07, + "loss": 0.5533, + "mean_token_accuracy": 0.8213154673576355, + "num_tokens": 30761975.0, + "step": 809 + }, + { + "epoch": 0.1030403256583132, + "ewc_loss": 1.5273690223693848e-06, + "grad_norm": 1.8922884464263916, + "learning_rate": 3.429419245442984e-07, + "loss": 0.5043, + "mean_token_accuracy": 0.8371376395225525, + "num_tokens": 30797754.0, + "step": 810 + }, + { + "epoch": 0.1031675359369037, + "ewc_loss": 1.5348196029663086e-06, + "grad_norm": 3.0764453411102295, + "learning_rate": 3.4336583298007626e-07, + "loss": 0.4804, + "mean_token_accuracy": 0.846156120300293, + "num_tokens": 30834183.0, + "step": 811 + }, + { + "epoch": 0.10329474621549421, + "ewc_loss": 1.5422701835632324e-06, + "grad_norm": 1.8142356872558594, + "learning_rate": 3.4378974141585416e-07, + "loss": 0.5083, + "mean_token_accuracy": 0.8373573422431946, + "num_tokens": 30874160.0, + "step": 812 + }, + { + "epoch": 0.10342195649408473, + "ewc_loss": 1.5422701835632324e-06, + "grad_norm": 1.5663982629776, + "learning_rate": 3.4421364985163206e-07, + "loss": 0.4478, + "mean_token_accuracy": 0.8545858263969421, + "num_tokens": 30916827.0, + "step": 813 + }, + { + "epoch": 0.10354916677267523, + "ewc_loss": 1.5348196029663086e-06, + "grad_norm": 1.664846658706665, + "learning_rate": 3.446375582874099e-07, + "loss": 0.5164, + "mean_token_accuracy": 0.8366163969039917, + "num_tokens": 30960063.0, + "step": 814 + }, + { + "epoch": 0.10367637705126574, + "ewc_loss": 1.5348196029663086e-06, + "grad_norm": 1.9428728818893433, + "learning_rate": 3.4506146672318775e-07, + "loss": 0.4654, + "mean_token_accuracy": 0.8480278253555298, + "num_tokens": 30995172.0, + "step": 815 + }, + { + "epoch": 0.10380358732985626, + "ewc_loss": 1.5497207641601562e-06, + "grad_norm": 1.8103420734405518, + "learning_rate": 3.4548537515896565e-07, + "loss": 0.5218, + "mean_token_accuracy": 0.8331249952316284, + "num_tokens": 31035089.0, + "step": 816 + }, + { + "epoch": 0.10393079760844676, + "ewc_loss": 1.5497207641601562e-06, + "grad_norm": 1.6759246587753296, + "learning_rate": 3.4590928359474355e-07, + "loss": 0.5078, + "mean_token_accuracy": 0.835224986076355, + "num_tokens": 31074836.0, + "step": 817 + }, + { + "epoch": 0.10405800788703727, + "ewc_loss": 1.5497207641601562e-06, + "grad_norm": 1.683217167854309, + "learning_rate": 3.463331920305214e-07, + "loss": 0.5255, + "mean_token_accuracy": 0.8325908184051514, + "num_tokens": 31114700.0, + "step": 818 + }, + { + "epoch": 0.10418521816562779, + "ewc_loss": 1.5497207641601562e-06, + "grad_norm": 1.9422636032104492, + "learning_rate": 3.4675710046629924e-07, + "loss": 0.4744, + "mean_token_accuracy": 0.8466897010803223, + "num_tokens": 31148258.0, + "step": 819 + }, + { + "epoch": 0.1043124284442183, + "ewc_loss": 1.5497207641601562e-06, + "grad_norm": 1.7273489236831665, + "learning_rate": 3.4718100890207714e-07, + "loss": 0.512, + "mean_token_accuracy": 0.837662398815155, + "num_tokens": 31185720.0, + "step": 820 + }, + { + "epoch": 0.1044396387228088, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 1.8579996824264526, + "learning_rate": 3.4760491733785504e-07, + "loss": 0.5216, + "mean_token_accuracy": 0.8299850821495056, + "num_tokens": 31220353.0, + "step": 821 + }, + { + "epoch": 0.10456684900139931, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 1.7389148473739624, + "learning_rate": 3.480288257736329e-07, + "loss": 0.5004, + "mean_token_accuracy": 0.8361079692840576, + "num_tokens": 31256477.0, + "step": 822 + }, + { + "epoch": 0.10469405927998983, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 1.8706616163253784, + "learning_rate": 3.4845273420941073e-07, + "loss": 0.5002, + "mean_token_accuracy": 0.840815544128418, + "num_tokens": 31294365.0, + "step": 823 + }, + { + "epoch": 0.10482126955858033, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 2.000624656677246, + "learning_rate": 3.4887664264518863e-07, + "loss": 0.4975, + "mean_token_accuracy": 0.8377583026885986, + "num_tokens": 31329357.0, + "step": 824 + }, + { + "epoch": 0.10494847983717084, + "ewc_loss": 1.55717134475708e-06, + "grad_norm": 1.7025156021118164, + "learning_rate": 3.4930055108096653e-07, + "loss": 0.5014, + "mean_token_accuracy": 0.8396602272987366, + "num_tokens": 31372180.0, + "step": 825 + }, + { + "epoch": 0.10507569011576136, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 1.752610445022583, + "learning_rate": 3.497244595167443e-07, + "loss": 0.4832, + "mean_token_accuracy": 0.8431713581085205, + "num_tokens": 31414866.0, + "step": 826 + }, + { + "epoch": 0.10520290039435186, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 1.921778678894043, + "learning_rate": 3.501483679525222e-07, + "loss": 0.5503, + "mean_token_accuracy": 0.8240472078323364, + "num_tokens": 31450583.0, + "step": 827 + }, + { + "epoch": 0.10533011067294237, + "ewc_loss": 1.5720725059509277e-06, + "grad_norm": 1.7723910808563232, + "learning_rate": 3.505722763883001e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.8430836200714111, + "num_tokens": 31490371.0, + "step": 828 + }, + { + "epoch": 0.10545732095153289, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 1.8744566440582275, + "learning_rate": 3.50996184824078e-07, + "loss": 0.5236, + "mean_token_accuracy": 0.831264853477478, + "num_tokens": 31525951.0, + "step": 829 + }, + { + "epoch": 0.10558453123012339, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 1.7183061838150024, + "learning_rate": 3.514200932598558e-07, + "loss": 0.5637, + "mean_token_accuracy": 0.8211350440979004, + "num_tokens": 31570109.0, + "step": 830 + }, + { + "epoch": 0.1057117415087139, + "ewc_loss": 1.55717134475708e-06, + "grad_norm": 1.826084852218628, + "learning_rate": 3.518440016956337e-07, + "loss": 0.5079, + "mean_token_accuracy": 0.8370299339294434, + "num_tokens": 31610569.0, + "step": 831 + }, + { + "epoch": 0.10583895178730442, + "ewc_loss": 1.5795230865478516e-06, + "grad_norm": 1.7351741790771484, + "learning_rate": 3.522679101314116e-07, + "loss": 0.5191, + "mean_token_accuracy": 0.8369965553283691, + "num_tokens": 31653585.0, + "step": 832 + }, + { + "epoch": 0.10596616206589493, + "ewc_loss": 1.5720725059509277e-06, + "grad_norm": 1.7231727838516235, + "learning_rate": 3.526918185671895e-07, + "loss": 0.5288, + "mean_token_accuracy": 0.83136385679245, + "num_tokens": 31696353.0, + "step": 833 + }, + { + "epoch": 0.10609337234448543, + "ewc_loss": 1.5720725059509277e-06, + "grad_norm": 1.880366563796997, + "learning_rate": 3.531157270029673e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.8515182137489319, + "num_tokens": 31730620.0, + "step": 834 + }, + { + "epoch": 0.10622058262307595, + "ewc_loss": 1.5720725059509277e-06, + "grad_norm": 1.7738516330718994, + "learning_rate": 3.535396354387452e-07, + "loss": 0.4776, + "mean_token_accuracy": 0.8458925485610962, + "num_tokens": 31770538.0, + "step": 835 + }, + { + "epoch": 0.10634779290166646, + "ewc_loss": 1.5720725059509277e-06, + "grad_norm": 1.9106494188308716, + "learning_rate": 3.539635438745231e-07, + "loss": 0.5181, + "mean_token_accuracy": 0.8333494663238525, + "num_tokens": 31808687.0, + "step": 836 + }, + { + "epoch": 0.10647500318025696, + "ewc_loss": 1.5720725059509277e-06, + "grad_norm": 1.6119384765625, + "learning_rate": 3.54387452310301e-07, + "loss": 0.4974, + "mean_token_accuracy": 0.8404892086982727, + "num_tokens": 31852310.0, + "step": 837 + }, + { + "epoch": 0.10660221345884748, + "ewc_loss": 1.5720725059509277e-06, + "grad_norm": 1.6784600019454956, + "learning_rate": 3.548113607460788e-07, + "loss": 0.4664, + "mean_token_accuracy": 0.8526695966720581, + "num_tokens": 31887897.0, + "step": 838 + }, + { + "epoch": 0.10672942373743799, + "ewc_loss": 1.5795230865478516e-06, + "grad_norm": 1.8344383239746094, + "learning_rate": 3.552352691818567e-07, + "loss": 0.4913, + "mean_token_accuracy": 0.8437883853912354, + "num_tokens": 31926159.0, + "step": 839 + }, + { + "epoch": 0.10685663401602849, + "ewc_loss": 1.5869736671447754e-06, + "grad_norm": 1.6316026449203491, + "learning_rate": 3.556591776176346e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.850334882736206, + "num_tokens": 31968432.0, + "step": 840 + }, + { + "epoch": 0.106983844294619, + "ewc_loss": 1.5869736671447754e-06, + "grad_norm": 1.9738166332244873, + "learning_rate": 3.560830860534125e-07, + "loss": 0.4975, + "mean_token_accuracy": 0.8436077833175659, + "num_tokens": 32000949.0, + "step": 841 + }, + { + "epoch": 0.10711105457320952, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 1.9388290643692017, + "learning_rate": 3.565069944891903e-07, + "loss": 0.5389, + "mean_token_accuracy": 0.8268791437149048, + "num_tokens": 32033862.0, + "step": 842 + }, + { + "epoch": 0.10723826485180002, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 1.8760308027267456, + "learning_rate": 3.569309029249682e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8403668403625488, + "num_tokens": 32068623.0, + "step": 843 + }, + { + "epoch": 0.10736547513039053, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 1.7792818546295166, + "learning_rate": 3.573548113607461e-07, + "loss": 0.4913, + "mean_token_accuracy": 0.8434057235717773, + "num_tokens": 32106658.0, + "step": 844 + }, + { + "epoch": 0.10749268540898105, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 1.9535773992538452, + "learning_rate": 3.577787197965239e-07, + "loss": 0.5302, + "mean_token_accuracy": 0.829407274723053, + "num_tokens": 32142031.0, + "step": 845 + }, + { + "epoch": 0.10761989568757156, + "ewc_loss": 1.601874828338623e-06, + "grad_norm": 1.7463716268539429, + "learning_rate": 3.5820262823230177e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8549165725708008, + "num_tokens": 32182686.0, + "step": 846 + }, + { + "epoch": 0.10774710596616206, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 2.166370391845703, + "learning_rate": 3.5862653666807967e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.8203912377357483, + "num_tokens": 32218027.0, + "step": 847 + }, + { + "epoch": 0.10787431624475258, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 1.796443223953247, + "learning_rate": 3.5905044510385757e-07, + "loss": 0.5164, + "mean_token_accuracy": 0.838213324546814, + "num_tokens": 32257854.0, + "step": 848 + }, + { + "epoch": 0.10800152652334309, + "ewc_loss": 1.6093254089355469e-06, + "grad_norm": 1.769997000694275, + "learning_rate": 3.594743535396354e-07, + "loss": 0.4992, + "mean_token_accuracy": 0.8406716585159302, + "num_tokens": 32295019.0, + "step": 849 + }, + { + "epoch": 0.10812873680193359, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 1.7756224870681763, + "learning_rate": 3.5989826197541326e-07, + "loss": 0.5471, + "mean_token_accuracy": 0.8257572650909424, + "num_tokens": 32335652.0, + "step": 850 + }, + { + "epoch": 0.1082559470805241, + "ewc_loss": 1.6093254089355469e-06, + "grad_norm": 1.6816489696502686, + "learning_rate": 3.6032217041119116e-07, + "loss": 0.4967, + "mean_token_accuracy": 0.8372407555580139, + "num_tokens": 32377249.0, + "step": 851 + }, + { + "epoch": 0.10838315735911462, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 1.5916767120361328, + "learning_rate": 3.6074607884696906e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8513331413269043, + "num_tokens": 32422467.0, + "step": 852 + }, + { + "epoch": 0.10851036763770512, + "ewc_loss": 1.601874828338623e-06, + "grad_norm": 2.6081082820892334, + "learning_rate": 3.611699872827469e-07, + "loss": 0.4549, + "mean_token_accuracy": 0.8516018390655518, + "num_tokens": 32462120.0, + "step": 853 + }, + { + "epoch": 0.10863757791629564, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 1.962159276008606, + "learning_rate": 3.6159389571852475e-07, + "loss": 0.5743, + "mean_token_accuracy": 0.818821907043457, + "num_tokens": 32498484.0, + "step": 854 + }, + { + "epoch": 0.10876478819488615, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 1.7038028240203857, + "learning_rate": 3.6201780415430265e-07, + "loss": 0.5139, + "mean_token_accuracy": 0.8356809616088867, + "num_tokens": 32541536.0, + "step": 855 + }, + { + "epoch": 0.10889199847347665, + "ewc_loss": 1.601874828338623e-06, + "grad_norm": 1.7290652990341187, + "learning_rate": 3.6244171259008055e-07, + "loss": 0.533, + "mean_token_accuracy": 0.8326082229614258, + "num_tokens": 32579750.0, + "step": 856 + }, + { + "epoch": 0.10901920875206716, + "ewc_loss": 1.6093254089355469e-06, + "grad_norm": 1.7097688913345337, + "learning_rate": 3.628656210258584e-07, + "loss": 0.5433, + "mean_token_accuracy": 0.8245148658752441, + "num_tokens": 32617518.0, + "step": 857 + }, + { + "epoch": 0.10914641903065768, + "ewc_loss": 1.6093254089355469e-06, + "grad_norm": 1.6647124290466309, + "learning_rate": 3.6328952946163624e-07, + "loss": 0.4851, + "mean_token_accuracy": 0.8445902466773987, + "num_tokens": 32653931.0, + "step": 858 + }, + { + "epoch": 0.1092736293092482, + "ewc_loss": 1.6093254089355469e-06, + "grad_norm": 1.7101322412490845, + "learning_rate": 3.6371343789741414e-07, + "loss": 0.4732, + "mean_token_accuracy": 0.8475877046585083, + "num_tokens": 32693797.0, + "step": 859 + }, + { + "epoch": 0.1094008395878387, + "ewc_loss": 1.6093254089355469e-06, + "grad_norm": 1.8781658411026, + "learning_rate": 3.6413734633319204e-07, + "loss": 0.519, + "mean_token_accuracy": 0.8350810408592224, + "num_tokens": 32732432.0, + "step": 860 + }, + { + "epoch": 0.10952804986642921, + "ewc_loss": 1.6093254089355469e-06, + "grad_norm": 1.7732837200164795, + "learning_rate": 3.645612547689699e-07, + "loss": 0.474, + "mean_token_accuracy": 0.8420852422714233, + "num_tokens": 32769225.0, + "step": 861 + }, + { + "epoch": 0.10965526014501972, + "ewc_loss": 1.6093254089355469e-06, + "grad_norm": 1.6489248275756836, + "learning_rate": 3.6498516320474773e-07, + "loss": 0.4672, + "mean_token_accuracy": 0.8514572978019714, + "num_tokens": 32812027.0, + "step": 862 + }, + { + "epoch": 0.10978247042361022, + "ewc_loss": 1.6093254089355469e-06, + "grad_norm": 1.7258760929107666, + "learning_rate": 3.6540907164052563e-07, + "loss": 0.4461, + "mean_token_accuracy": 0.8558658361434937, + "num_tokens": 32849334.0, + "step": 863 + }, + { + "epoch": 0.10990968070220074, + "ewc_loss": 1.6167759895324707e-06, + "grad_norm": 1.9358034133911133, + "learning_rate": 3.658329800763035e-07, + "loss": 0.5399, + "mean_token_accuracy": 0.825797438621521, + "num_tokens": 32888422.0, + "step": 864 + }, + { + "epoch": 0.11003689098079125, + "ewc_loss": 1.6242265701293945e-06, + "grad_norm": 1.9682691097259521, + "learning_rate": 3.662568885120814e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.8388712406158447, + "num_tokens": 32922672.0, + "step": 865 + }, + { + "epoch": 0.11016410125938175, + "ewc_loss": 1.6242265701293945e-06, + "grad_norm": 1.7467358112335205, + "learning_rate": 3.666807969478592e-07, + "loss": 0.5311, + "mean_token_accuracy": 0.8276327848434448, + "num_tokens": 32961694.0, + "step": 866 + }, + { + "epoch": 0.11029131153797227, + "ewc_loss": 1.6316771507263184e-06, + "grad_norm": 1.865481972694397, + "learning_rate": 3.671047053836371e-07, + "loss": 0.5181, + "mean_token_accuracy": 0.8318371176719666, + "num_tokens": 32998595.0, + "step": 867 + }, + { + "epoch": 0.11041852181656278, + "ewc_loss": 1.6391277313232422e-06, + "grad_norm": 1.6863635778427124, + "learning_rate": 3.6752861381941497e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8432952761650085, + "num_tokens": 33035884.0, + "step": 868 + }, + { + "epoch": 0.11054573209515328, + "ewc_loss": 1.6391277313232422e-06, + "grad_norm": 1.795446515083313, + "learning_rate": 3.6795252225519287e-07, + "loss": 0.5399, + "mean_token_accuracy": 0.8303908109664917, + "num_tokens": 33073335.0, + "step": 869 + }, + { + "epoch": 0.1106729423737438, + "ewc_loss": 1.6391277313232422e-06, + "grad_norm": 1.6809781789779663, + "learning_rate": 3.6837643069097077e-07, + "loss": 0.4424, + "mean_token_accuracy": 0.8553808927536011, + "num_tokens": 33111443.0, + "step": 870 + }, + { + "epoch": 0.11080015265233431, + "ewc_loss": 1.6391277313232422e-06, + "grad_norm": 1.61839759349823, + "learning_rate": 3.688003391267486e-07, + "loss": 0.501, + "mean_token_accuracy": 0.8424859046936035, + "num_tokens": 33156362.0, + "step": 871 + }, + { + "epoch": 0.11092736293092482, + "ewc_loss": 1.6540288925170898e-06, + "grad_norm": 1.6339542865753174, + "learning_rate": 3.6922424756252646e-07, + "loss": 0.4711, + "mean_token_accuracy": 0.8483197093009949, + "num_tokens": 33204968.0, + "step": 872 + }, + { + "epoch": 0.11105457320951533, + "ewc_loss": 1.6614794731140137e-06, + "grad_norm": 1.9690779447555542, + "learning_rate": 3.6964815599830436e-07, + "loss": 0.5157, + "mean_token_accuracy": 0.8319567441940308, + "num_tokens": 33239265.0, + "step": 873 + }, + { + "epoch": 0.11118178348810584, + "ewc_loss": 1.6614794731140137e-06, + "grad_norm": 1.8438694477081299, + "learning_rate": 3.7007206443408226e-07, + "loss": 0.4523, + "mean_token_accuracy": 0.8543068170547485, + "num_tokens": 33275897.0, + "step": 874 + }, + { + "epoch": 0.11130899376669635, + "ewc_loss": 1.6763806343078613e-06, + "grad_norm": 1.7996151447296143, + "learning_rate": 3.704959728698601e-07, + "loss": 0.5708, + "mean_token_accuracy": 0.8177343606948853, + "num_tokens": 33313966.0, + "step": 875 + }, + { + "epoch": 0.11143620404528685, + "ewc_loss": 1.6763806343078613e-06, + "grad_norm": 1.8389177322387695, + "learning_rate": 3.7091988130563795e-07, + "loss": 0.5194, + "mean_token_accuracy": 0.8358361721038818, + "num_tokens": 33352909.0, + "step": 876 + }, + { + "epoch": 0.11156341432387737, + "ewc_loss": 1.6763806343078613e-06, + "grad_norm": 1.57749605178833, + "learning_rate": 3.7134378974141585e-07, + "loss": 0.5027, + "mean_token_accuracy": 0.8407585620880127, + "num_tokens": 33400120.0, + "step": 877 + }, + { + "epoch": 0.11169062460246788, + "ewc_loss": 1.6763806343078613e-06, + "grad_norm": 1.8098132610321045, + "learning_rate": 3.7176769817719375e-07, + "loss": 0.5269, + "mean_token_accuracy": 0.8316101431846619, + "num_tokens": 33436703.0, + "step": 878 + }, + { + "epoch": 0.11181783488105838, + "ewc_loss": 1.6763806343078613e-06, + "grad_norm": 1.8427776098251343, + "learning_rate": 3.7219160661297154e-07, + "loss": 0.4384, + "mean_token_accuracy": 0.8591610789299011, + "num_tokens": 33473515.0, + "step": 879 + }, + { + "epoch": 0.1119450451596489, + "ewc_loss": 1.6763806343078613e-06, + "grad_norm": 1.7591887712478638, + "learning_rate": 3.7261551504874944e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.8425774574279785, + "num_tokens": 33507659.0, + "step": 880 + }, + { + "epoch": 0.11207225543823941, + "ewc_loss": 1.6763806343078613e-06, + "grad_norm": 1.724513053894043, + "learning_rate": 3.7303942348452734e-07, + "loss": 0.4451, + "mean_token_accuracy": 0.854404628276825, + "num_tokens": 33542430.0, + "step": 881 + }, + { + "epoch": 0.11219946571682991, + "ewc_loss": 1.691281795501709e-06, + "grad_norm": 1.698539137840271, + "learning_rate": 3.7346333192030524e-07, + "loss": 0.4554, + "mean_token_accuracy": 0.8568885326385498, + "num_tokens": 33582579.0, + "step": 882 + }, + { + "epoch": 0.11232667599542043, + "ewc_loss": 1.6763806343078613e-06, + "grad_norm": 1.8714244365692139, + "learning_rate": 3.7388724035608303e-07, + "loss": 0.4784, + "mean_token_accuracy": 0.8489245176315308, + "num_tokens": 33625928.0, + "step": 883 + }, + { + "epoch": 0.11245388627401094, + "ewc_loss": 1.6763806343078613e-06, + "grad_norm": 1.7944854497909546, + "learning_rate": 3.7431114879186093e-07, + "loss": 0.5168, + "mean_token_accuracy": 0.835213840007782, + "num_tokens": 33661325.0, + "step": 884 + }, + { + "epoch": 0.11258109655260146, + "ewc_loss": 1.6838312149047852e-06, + "grad_norm": 1.766153335571289, + "learning_rate": 3.7473505722763883e-07, + "loss": 0.5385, + "mean_token_accuracy": 0.8281770348548889, + "num_tokens": 33696472.0, + "step": 885 + }, + { + "epoch": 0.11270830683119196, + "ewc_loss": 1.691281795501709e-06, + "grad_norm": 1.743361473083496, + "learning_rate": 3.7515896566341673e-07, + "loss": 0.4916, + "mean_token_accuracy": 0.839305579662323, + "num_tokens": 33732637.0, + "step": 886 + }, + { + "epoch": 0.11283551710978247, + "ewc_loss": 1.6838312149047852e-06, + "grad_norm": 1.7140933275222778, + "learning_rate": 3.755828740991945e-07, + "loss": 0.5172, + "mean_token_accuracy": 0.8345683217048645, + "num_tokens": 33775106.0, + "step": 887 + }, + { + "epoch": 0.11296272738837299, + "ewc_loss": 1.691281795501709e-06, + "grad_norm": 1.8029597997665405, + "learning_rate": 3.760067825349724e-07, + "loss": 0.4797, + "mean_token_accuracy": 0.8469479084014893, + "num_tokens": 33811297.0, + "step": 888 + }, + { + "epoch": 0.11308993766696349, + "ewc_loss": 1.691281795501709e-06, + "grad_norm": 1.5305684804916382, + "learning_rate": 3.764306909707503e-07, + "loss": 0.4432, + "mean_token_accuracy": 0.8564467430114746, + "num_tokens": 33854038.0, + "step": 889 + }, + { + "epoch": 0.113217147945554, + "ewc_loss": 1.6987323760986328e-06, + "grad_norm": 1.8776952028274536, + "learning_rate": 3.768545994065282e-07, + "loss": 0.4986, + "mean_token_accuracy": 0.8391004800796509, + "num_tokens": 33884929.0, + "step": 890 + }, + { + "epoch": 0.11334435822414451, + "ewc_loss": 1.6987323760986328e-06, + "grad_norm": 1.7698768377304077, + "learning_rate": 3.77278507842306e-07, + "loss": 0.5076, + "mean_token_accuracy": 0.8382035493850708, + "num_tokens": 33926065.0, + "step": 891 + }, + { + "epoch": 0.11347156850273502, + "ewc_loss": 1.6987323760986328e-06, + "grad_norm": 1.743647813796997, + "learning_rate": 3.777024162780839e-07, + "loss": 0.461, + "mean_token_accuracy": 0.8487738370895386, + "num_tokens": 33965557.0, + "step": 892 + }, + { + "epoch": 0.11359877878132553, + "ewc_loss": 1.6987323760986328e-06, + "grad_norm": 1.7529850006103516, + "learning_rate": 3.781263247138618e-07, + "loss": 0.4672, + "mean_token_accuracy": 0.8483991026878357, + "num_tokens": 34004132.0, + "step": 893 + }, + { + "epoch": 0.11372598905991604, + "ewc_loss": 1.6987323760986328e-06, + "grad_norm": 1.7944726943969727, + "learning_rate": 3.785502331496397e-07, + "loss": 0.494, + "mean_token_accuracy": 0.8345142602920532, + "num_tokens": 34039781.0, + "step": 894 + }, + { + "epoch": 0.11385319933850654, + "ewc_loss": 1.7061829566955566e-06, + "grad_norm": 1.770765781402588, + "learning_rate": 3.789741415854175e-07, + "loss": 0.5175, + "mean_token_accuracy": 0.8341211080551147, + "num_tokens": 34080487.0, + "step": 895 + }, + { + "epoch": 0.11398040961709706, + "ewc_loss": 1.7061829566955566e-06, + "grad_norm": 1.7513781785964966, + "learning_rate": 3.793980500211954e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.8399299383163452, + "num_tokens": 34119913.0, + "step": 896 + }, + { + "epoch": 0.11410761989568757, + "ewc_loss": 1.6987323760986328e-06, + "grad_norm": 1.6708632707595825, + "learning_rate": 3.798219584569733e-07, + "loss": 0.4314, + "mean_token_accuracy": 0.8595619201660156, + "num_tokens": 34155811.0, + "step": 897 + }, + { + "epoch": 0.11423483017427809, + "ewc_loss": 1.6987323760986328e-06, + "grad_norm": 1.8512439727783203, + "learning_rate": 3.8024586689275115e-07, + "loss": 0.5518, + "mean_token_accuracy": 0.8282003998756409, + "num_tokens": 34195022.0, + "step": 898 + }, + { + "epoch": 0.11436204045286859, + "ewc_loss": 1.7061829566955566e-06, + "grad_norm": 1.724054217338562, + "learning_rate": 3.80669775328529e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.8423196077346802, + "num_tokens": 34229182.0, + "step": 899 + }, + { + "epoch": 0.1144892507314591, + "ewc_loss": 1.7061829566955566e-06, + "grad_norm": 1.7412058115005493, + "learning_rate": 3.810936837643069e-07, + "loss": 0.4504, + "mean_token_accuracy": 0.8532505631446838, + "num_tokens": 34266931.0, + "step": 900 + }, + { + "epoch": 0.11461646101004962, + "ewc_loss": 1.7210841178894043e-06, + "grad_norm": 2.6075704097747803, + "learning_rate": 3.815175922000848e-07, + "loss": 0.4401, + "mean_token_accuracy": 0.8554513454437256, + "num_tokens": 34301705.0, + "step": 901 + }, + { + "epoch": 0.11474367128864012, + "ewc_loss": 1.7210841178894043e-06, + "grad_norm": 1.8200913667678833, + "learning_rate": 3.8194150063586264e-07, + "loss": 0.521, + "mean_token_accuracy": 0.8364647626876831, + "num_tokens": 34340975.0, + "step": 902 + }, + { + "epoch": 0.11487088156723063, + "ewc_loss": 1.7136335372924805e-06, + "grad_norm": 1.840836763381958, + "learning_rate": 3.823654090716405e-07, + "loss": 0.4914, + "mean_token_accuracy": 0.8390566110610962, + "num_tokens": 34380024.0, + "step": 903 + }, + { + "epoch": 0.11499809184582115, + "ewc_loss": 1.7136335372924805e-06, + "grad_norm": 1.8348584175109863, + "learning_rate": 3.827893175074184e-07, + "loss": 0.5137, + "mean_token_accuracy": 0.8420150279998779, + "num_tokens": 34419654.0, + "step": 904 + }, + { + "epoch": 0.11512530212441165, + "ewc_loss": 1.7136335372924805e-06, + "grad_norm": 1.8980189561843872, + "learning_rate": 3.832132259431963e-07, + "loss": 0.444, + "mean_token_accuracy": 0.857921838760376, + "num_tokens": 34453034.0, + "step": 905 + }, + { + "epoch": 0.11525251240300216, + "ewc_loss": 1.7285346984863281e-06, + "grad_norm": 1.7541528940200806, + "learning_rate": 3.8363713437897413e-07, + "loss": 0.5463, + "mean_token_accuracy": 0.8242438435554504, + "num_tokens": 34493694.0, + "step": 906 + }, + { + "epoch": 0.11537972268159268, + "ewc_loss": 1.735985279083252e-06, + "grad_norm": 1.8181942701339722, + "learning_rate": 3.8406104281475197e-07, + "loss": 0.5378, + "mean_token_accuracy": 0.8297427296638489, + "num_tokens": 34531636.0, + "step": 907 + }, + { + "epoch": 0.11550693296018319, + "ewc_loss": 1.735985279083252e-06, + "grad_norm": 1.8109999895095825, + "learning_rate": 3.8448495125052987e-07, + "loss": 0.5235, + "mean_token_accuracy": 0.830021858215332, + "num_tokens": 34570851.0, + "step": 908 + }, + { + "epoch": 0.11563414323877369, + "ewc_loss": 1.7434358596801758e-06, + "grad_norm": 1.69517982006073, + "learning_rate": 3.8490885968630777e-07, + "loss": 0.4761, + "mean_token_accuracy": 0.8475011587142944, + "num_tokens": 34613314.0, + "step": 909 + }, + { + "epoch": 0.1157613535173642, + "ewc_loss": 1.7508864402770996e-06, + "grad_norm": 1.6874223947525024, + "learning_rate": 3.853327681220856e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.8403671979904175, + "num_tokens": 34656701.0, + "step": 910 + }, + { + "epoch": 0.11588856379595472, + "ewc_loss": 1.7508864402770996e-06, + "grad_norm": 1.952980637550354, + "learning_rate": 3.8575667655786346e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.8508031368255615, + "num_tokens": 34688595.0, + "step": 911 + }, + { + "epoch": 0.11601577407454522, + "ewc_loss": 1.7508864402770996e-06, + "grad_norm": 1.6455326080322266, + "learning_rate": 3.8618058499364136e-07, + "loss": 0.4856, + "mean_token_accuracy": 0.8452465534210205, + "num_tokens": 34726609.0, + "step": 912 + }, + { + "epoch": 0.11614298435313573, + "ewc_loss": 1.7508864402770996e-06, + "grad_norm": 1.7572247982025146, + "learning_rate": 3.8660449342941926e-07, + "loss": 0.5403, + "mean_token_accuracy": 0.831041693687439, + "num_tokens": 34769213.0, + "step": 913 + }, + { + "epoch": 0.11627019463172625, + "ewc_loss": 1.7508864402770996e-06, + "grad_norm": 1.798376202583313, + "learning_rate": 3.870284018651971e-07, + "loss": 0.5149, + "mean_token_accuracy": 0.8356679677963257, + "num_tokens": 34806722.0, + "step": 914 + }, + { + "epoch": 0.11639740491031675, + "ewc_loss": 1.7508864402770996e-06, + "grad_norm": 1.7334080934524536, + "learning_rate": 3.8745231030097495e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.8510055541992188, + "num_tokens": 34846408.0, + "step": 915 + }, + { + "epoch": 0.11652461518890726, + "ewc_loss": 1.7508864402770996e-06, + "grad_norm": 1.7532302141189575, + "learning_rate": 3.8787621873675285e-07, + "loss": 0.4593, + "mean_token_accuracy": 0.8503484129905701, + "num_tokens": 34883791.0, + "step": 916 + }, + { + "epoch": 0.11665182546749778, + "ewc_loss": 1.7583370208740234e-06, + "grad_norm": 1.702265739440918, + "learning_rate": 3.883001271725307e-07, + "loss": 0.4547, + "mean_token_accuracy": 0.8531422019004822, + "num_tokens": 34922768.0, + "step": 917 + }, + { + "epoch": 0.11677903574608828, + "ewc_loss": 1.7583370208740234e-06, + "grad_norm": 1.6805096864700317, + "learning_rate": 3.887240356083086e-07, + "loss": 0.4339, + "mean_token_accuracy": 0.8579322099685669, + "num_tokens": 34962184.0, + "step": 918 + }, + { + "epoch": 0.11690624602467879, + "ewc_loss": 1.7657876014709473e-06, + "grad_norm": 2.0260677337646484, + "learning_rate": 3.8914794404408644e-07, + "loss": 0.4778, + "mean_token_accuracy": 0.8447725176811218, + "num_tokens": 34991455.0, + "step": 919 + }, + { + "epoch": 0.1170334563032693, + "ewc_loss": 1.773238182067871e-06, + "grad_norm": 1.9586769342422485, + "learning_rate": 3.8957185247986434e-07, + "loss": 0.5077, + "mean_token_accuracy": 0.8362524509429932, + "num_tokens": 35025044.0, + "step": 920 + }, + { + "epoch": 0.11716066658185982, + "ewc_loss": 1.773238182067871e-06, + "grad_norm": 1.824992060661316, + "learning_rate": 3.899957609156422e-07, + "loss": 0.4801, + "mean_token_accuracy": 0.8448033332824707, + "num_tokens": 35064564.0, + "step": 921 + }, + { + "epoch": 0.11728787686045032, + "ewc_loss": 1.773238182067871e-06, + "grad_norm": 1.6627178192138672, + "learning_rate": 3.904196693514201e-07, + "loss": 0.4975, + "mean_token_accuracy": 0.8413252830505371, + "num_tokens": 35109786.0, + "step": 922 + }, + { + "epoch": 0.11741508713904084, + "ewc_loss": 1.773238182067871e-06, + "grad_norm": 1.929521083831787, + "learning_rate": 3.9084357778719793e-07, + "loss": 0.5604, + "mean_token_accuracy": 0.8215823173522949, + "num_tokens": 35148351.0, + "step": 923 + }, + { + "epoch": 0.11754229741763135, + "ewc_loss": 1.773238182067871e-06, + "grad_norm": 1.842130184173584, + "learning_rate": 3.9126748622297583e-07, + "loss": 0.5227, + "mean_token_accuracy": 0.8336445689201355, + "num_tokens": 35183464.0, + "step": 924 + }, + { + "epoch": 0.11766950769622185, + "ewc_loss": 1.773238182067871e-06, + "grad_norm": 1.791264533996582, + "learning_rate": 3.916913946587537e-07, + "loss": 0.5063, + "mean_token_accuracy": 0.8360236883163452, + "num_tokens": 35218561.0, + "step": 925 + }, + { + "epoch": 0.11779671797481236, + "ewc_loss": 1.780688762664795e-06, + "grad_norm": 1.8465287685394287, + "learning_rate": 3.921153030945316e-07, + "loss": 0.5211, + "mean_token_accuracy": 0.8326804637908936, + "num_tokens": 35257740.0, + "step": 926 + }, + { + "epoch": 0.11792392825340288, + "ewc_loss": 1.780688762664795e-06, + "grad_norm": 2.4214231967926025, + "learning_rate": 3.925392115303094e-07, + "loss": 0.4796, + "mean_token_accuracy": 0.8469289541244507, + "num_tokens": 35291632.0, + "step": 927 + }, + { + "epoch": 0.11805113853199338, + "ewc_loss": 1.780688762664795e-06, + "grad_norm": 1.8737947940826416, + "learning_rate": 3.929631199660873e-07, + "loss": 0.4822, + "mean_token_accuracy": 0.8413544297218323, + "num_tokens": 35327686.0, + "step": 928 + }, + { + "epoch": 0.1181783488105839, + "ewc_loss": 1.780688762664795e-06, + "grad_norm": 1.7747255563735962, + "learning_rate": 3.9338702840186517e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8424162864685059, + "num_tokens": 35364480.0, + "step": 929 + }, + { + "epoch": 0.11830555908917441, + "ewc_loss": 1.780688762664795e-06, + "grad_norm": 1.6898399591445923, + "learning_rate": 3.9381093683764307e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.8426572680473328, + "num_tokens": 35406017.0, + "step": 930 + }, + { + "epoch": 0.11843276936776491, + "ewc_loss": 1.7881393432617188e-06, + "grad_norm": 1.7340383529663086, + "learning_rate": 3.942348452734209e-07, + "loss": 0.5461, + "mean_token_accuracy": 0.8248109221458435, + "num_tokens": 35448447.0, + "step": 931 + }, + { + "epoch": 0.11855997964635542, + "ewc_loss": 1.7881393432617188e-06, + "grad_norm": 1.7000138759613037, + "learning_rate": 3.946587537091988e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.8391504883766174, + "num_tokens": 35488120.0, + "step": 932 + }, + { + "epoch": 0.11868718992494594, + "ewc_loss": 1.7881393432617188e-06, + "grad_norm": 1.6505789756774902, + "learning_rate": 3.9508266214497666e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8480329513549805, + "num_tokens": 35531759.0, + "step": 933 + }, + { + "epoch": 0.11881440020353645, + "ewc_loss": 1.7881393432617188e-06, + "grad_norm": 1.6471178531646729, + "learning_rate": 3.9550657058075456e-07, + "loss": 0.4561, + "mean_token_accuracy": 0.8543906211853027, + "num_tokens": 35571476.0, + "step": 934 + }, + { + "epoch": 0.11894161048212695, + "ewc_loss": 1.7881393432617188e-06, + "grad_norm": 1.8902124166488647, + "learning_rate": 3.959304790165324e-07, + "loss": 0.4955, + "mean_token_accuracy": 0.8414437174797058, + "num_tokens": 35608440.0, + "step": 935 + }, + { + "epoch": 0.11906882076071747, + "ewc_loss": 1.7881393432617188e-06, + "grad_norm": 1.8418238162994385, + "learning_rate": 3.9635438745231025e-07, + "loss": 0.4911, + "mean_token_accuracy": 0.8425041437149048, + "num_tokens": 35641861.0, + "step": 936 + }, + { + "epoch": 0.11919603103930798, + "ewc_loss": 1.7881393432617188e-06, + "grad_norm": 1.7151530981063843, + "learning_rate": 3.9677829588808815e-07, + "loss": 0.511, + "mean_token_accuracy": 0.834863007068634, + "num_tokens": 35684626.0, + "step": 937 + }, + { + "epoch": 0.11932324131789848, + "ewc_loss": 1.7955899238586426e-06, + "grad_norm": 1.8675466775894165, + "learning_rate": 3.9720220432386605e-07, + "loss": 0.5196, + "mean_token_accuracy": 0.832301139831543, + "num_tokens": 35718125.0, + "step": 938 + }, + { + "epoch": 0.119450451596489, + "ewc_loss": 1.7955899238586426e-06, + "grad_norm": 1.7812221050262451, + "learning_rate": 3.976261127596439e-07, + "loss": 0.4622, + "mean_token_accuracy": 0.8527366518974304, + "num_tokens": 35752818.0, + "step": 939 + }, + { + "epoch": 0.11957766187507951, + "ewc_loss": 1.7955899238586426e-06, + "grad_norm": 1.8664124011993408, + "learning_rate": 3.9805002119542174e-07, + "loss": 0.4438, + "mean_token_accuracy": 0.8568630814552307, + "num_tokens": 35787097.0, + "step": 940 + }, + { + "epoch": 0.11970487215367001, + "ewc_loss": 1.7955899238586426e-06, + "grad_norm": 1.7761636972427368, + "learning_rate": 3.9847392963119964e-07, + "loss": 0.5378, + "mean_token_accuracy": 0.8303860425949097, + "num_tokens": 35825887.0, + "step": 941 + }, + { + "epoch": 0.11983208243226053, + "ewc_loss": 1.8104910850524902e-06, + "grad_norm": 1.7697365283966064, + "learning_rate": 3.9889783806697754e-07, + "loss": 0.47, + "mean_token_accuracy": 0.8510144948959351, + "num_tokens": 35863592.0, + "step": 942 + }, + { + "epoch": 0.11995929271085104, + "ewc_loss": 1.817941665649414e-06, + "grad_norm": 1.970906376838684, + "learning_rate": 3.993217465027554e-07, + "loss": 0.5433, + "mean_token_accuracy": 0.8301840424537659, + "num_tokens": 35904483.0, + "step": 943 + }, + { + "epoch": 0.12008650298944154, + "ewc_loss": 1.8104910850524902e-06, + "grad_norm": 1.782488226890564, + "learning_rate": 3.9974565493853323e-07, + "loss": 0.4316, + "mean_token_accuracy": 0.8602376580238342, + "num_tokens": 35938662.0, + "step": 944 + }, + { + "epoch": 0.12021371326803205, + "ewc_loss": 1.8104910850524902e-06, + "grad_norm": 1.8189631700515747, + "learning_rate": 4.0016956337431113e-07, + "loss": 0.495, + "mean_token_accuracy": 0.8413410782814026, + "num_tokens": 35975176.0, + "step": 945 + }, + { + "epoch": 0.12034092354662257, + "ewc_loss": 1.8253922462463379e-06, + "grad_norm": 1.9089744091033936, + "learning_rate": 4.0059347181008903e-07, + "loss": 0.4826, + "mean_token_accuracy": 0.8440570831298828, + "num_tokens": 36008893.0, + "step": 946 + }, + { + "epoch": 0.12046813382521308, + "ewc_loss": 1.8253922462463379e-06, + "grad_norm": 1.5755244493484497, + "learning_rate": 4.010173802458669e-07, + "loss": 0.4407, + "mean_token_accuracy": 0.8585355281829834, + "num_tokens": 36049882.0, + "step": 947 + }, + { + "epoch": 0.12059534410380358, + "ewc_loss": 1.8253922462463379e-06, + "grad_norm": 1.7201759815216064, + "learning_rate": 4.014412886816447e-07, + "loss": 0.5522, + "mean_token_accuracy": 0.8258039355278015, + "num_tokens": 36090615.0, + "step": 948 + }, + { + "epoch": 0.1207225543823941, + "ewc_loss": 1.8253922462463379e-06, + "grad_norm": 1.6856346130371094, + "learning_rate": 4.018651971174226e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.8436917066574097, + "num_tokens": 36128159.0, + "step": 949 + }, + { + "epoch": 0.12084976466098461, + "ewc_loss": 1.817941665649414e-06, + "grad_norm": 1.6710609197616577, + "learning_rate": 4.022891055532005e-07, + "loss": 0.4992, + "mean_token_accuracy": 0.8401660919189453, + "num_tokens": 36172851.0, + "step": 950 + }, + { + "epoch": 0.12097697493957511, + "ewc_loss": 1.817941665649414e-06, + "grad_norm": 1.737249493598938, + "learning_rate": 4.0271301398897837e-07, + "loss": 0.5477, + "mean_token_accuracy": 0.828151285648346, + "num_tokens": 36214282.0, + "step": 951 + }, + { + "epoch": 0.12110418521816563, + "ewc_loss": 1.8253922462463379e-06, + "grad_norm": 1.875752329826355, + "learning_rate": 4.031369224247562e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.8440679311752319, + "num_tokens": 36248619.0, + "step": 952 + }, + { + "epoch": 0.12123139549675614, + "ewc_loss": 1.8253922462463379e-06, + "grad_norm": 1.5877399444580078, + "learning_rate": 4.035608308605341e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8364337682723999, + "num_tokens": 36289070.0, + "step": 953 + }, + { + "epoch": 0.12135860577534664, + "ewc_loss": 1.8253922462463379e-06, + "grad_norm": 1.7195210456848145, + "learning_rate": 4.03984739296312e-07, + "loss": 0.5048, + "mean_token_accuracy": 0.839878261089325, + "num_tokens": 36330883.0, + "step": 954 + }, + { + "epoch": 0.12148581605393716, + "ewc_loss": 1.8253922462463379e-06, + "grad_norm": 1.7205357551574707, + "learning_rate": 4.044086477320898e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.8455849289894104, + "num_tokens": 36371507.0, + "step": 955 + }, + { + "epoch": 0.12161302633252767, + "ewc_loss": 1.8253922462463379e-06, + "grad_norm": 1.8309046030044556, + "learning_rate": 4.048325561678677e-07, + "loss": 0.5084, + "mean_token_accuracy": 0.8368439674377441, + "num_tokens": 36407262.0, + "step": 956 + }, + { + "epoch": 0.12174023661111817, + "ewc_loss": 1.8328428268432617e-06, + "grad_norm": 1.7230333089828491, + "learning_rate": 4.052564646036456e-07, + "loss": 0.5056, + "mean_token_accuracy": 0.8370038270950317, + "num_tokens": 36448089.0, + "step": 957 + }, + { + "epoch": 0.12186744688970869, + "ewc_loss": 1.8328428268432617e-06, + "grad_norm": 1.8438161611557007, + "learning_rate": 4.056803730394235e-07, + "loss": 0.5179, + "mean_token_accuracy": 0.8312644958496094, + "num_tokens": 36484022.0, + "step": 958 + }, + { + "epoch": 0.1219946571682992, + "ewc_loss": 1.8328428268432617e-06, + "grad_norm": 1.8079190254211426, + "learning_rate": 4.061042814752013e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8424365520477295, + "num_tokens": 36519385.0, + "step": 959 + }, + { + "epoch": 0.12212186744688971, + "ewc_loss": 1.8253922462463379e-06, + "grad_norm": 1.9645322561264038, + "learning_rate": 4.065281899109792e-07, + "loss": 0.4981, + "mean_token_accuracy": 0.8380721211433411, + "num_tokens": 36551943.0, + "step": 960 + }, + { + "epoch": 0.12224907772548022, + "ewc_loss": 1.8402934074401855e-06, + "grad_norm": 1.6595323085784912, + "learning_rate": 4.069520983467571e-07, + "loss": 0.4463, + "mean_token_accuracy": 0.856849193572998, + "num_tokens": 36590927.0, + "step": 961 + }, + { + "epoch": 0.12237628800407073, + "ewc_loss": 1.8402934074401855e-06, + "grad_norm": 1.8330011367797852, + "learning_rate": 4.07376006782535e-07, + "loss": 0.4759, + "mean_token_accuracy": 0.8442428708076477, + "num_tokens": 36625199.0, + "step": 962 + }, + { + "epoch": 0.12250349828266124, + "ewc_loss": 1.8477439880371094e-06, + "grad_norm": 1.7953064441680908, + "learning_rate": 4.077999152183128e-07, + "loss": 0.4388, + "mean_token_accuracy": 0.8567427396774292, + "num_tokens": 36658981.0, + "step": 963 + }, + { + "epoch": 0.12263070856125174, + "ewc_loss": 1.8477439880371094e-06, + "grad_norm": 1.7120275497436523, + "learning_rate": 4.082238236540907e-07, + "loss": 0.5006, + "mean_token_accuracy": 0.8432348966598511, + "num_tokens": 36704214.0, + "step": 964 + }, + { + "epoch": 0.12275791883984226, + "ewc_loss": 1.8477439880371094e-06, + "grad_norm": 1.6750619411468506, + "learning_rate": 4.086477320898686e-07, + "loss": 0.469, + "mean_token_accuracy": 0.8475260734558105, + "num_tokens": 36744782.0, + "step": 965 + }, + { + "epoch": 0.12288512911843277, + "ewc_loss": 1.8477439880371094e-06, + "grad_norm": 1.7073562145233154, + "learning_rate": 4.090716405256465e-07, + "loss": 0.5034, + "mean_token_accuracy": 0.8367069959640503, + "num_tokens": 36783645.0, + "step": 966 + }, + { + "epoch": 0.12301233939702327, + "ewc_loss": 1.8477439880371094e-06, + "grad_norm": 1.8890060186386108, + "learning_rate": 4.094955489614243e-07, + "loss": 0.4717, + "mean_token_accuracy": 0.8463239073753357, + "num_tokens": 36817539.0, + "step": 967 + }, + { + "epoch": 0.12313954967561379, + "ewc_loss": 1.862645149230957e-06, + "grad_norm": 1.6629770994186401, + "learning_rate": 4.099194573972022e-07, + "loss": 0.4744, + "mean_token_accuracy": 0.8463796377182007, + "num_tokens": 36859144.0, + "step": 968 + }, + { + "epoch": 0.1232667599542043, + "ewc_loss": 1.8775463104248047e-06, + "grad_norm": 1.7127119302749634, + "learning_rate": 4.1034336583298007e-07, + "loss": 0.5175, + "mean_token_accuracy": 0.8342066407203674, + "num_tokens": 36900833.0, + "step": 969 + }, + { + "epoch": 0.1233939702327948, + "ewc_loss": 1.8775463104248047e-06, + "grad_norm": 1.678109884262085, + "learning_rate": 4.1076727426875797e-07, + "loss": 0.5057, + "mean_token_accuracy": 0.8376802206039429, + "num_tokens": 36940935.0, + "step": 970 + }, + { + "epoch": 0.12352118051138532, + "ewc_loss": 1.8775463104248047e-06, + "grad_norm": 1.8146005868911743, + "learning_rate": 4.1119118270453577e-07, + "loss": 0.5671, + "mean_token_accuracy": 0.8174118995666504, + "num_tokens": 36979874.0, + "step": 971 + }, + { + "epoch": 0.12364839078997583, + "ewc_loss": 1.8924474716186523e-06, + "grad_norm": 1.764425277709961, + "learning_rate": 4.1161509114031366e-07, + "loss": 0.4499, + "mean_token_accuracy": 0.8569556474685669, + "num_tokens": 37017917.0, + "step": 972 + }, + { + "epoch": 0.12377560106856635, + "ewc_loss": 1.8849968910217285e-06, + "grad_norm": 1.8286510705947876, + "learning_rate": 4.1203899957609156e-07, + "loss": 0.5143, + "mean_token_accuracy": 0.8364053964614868, + "num_tokens": 37054914.0, + "step": 973 + }, + { + "epoch": 0.12390281134715685, + "ewc_loss": 1.8924474716186523e-06, + "grad_norm": 1.8584142923355103, + "learning_rate": 4.124629080118694e-07, + "loss": 0.4645, + "mean_token_accuracy": 0.8468576669692993, + "num_tokens": 37088277.0, + "step": 974 + }, + { + "epoch": 0.12403002162574736, + "ewc_loss": 1.8924474716186523e-06, + "grad_norm": 1.7992990016937256, + "learning_rate": 4.1288681644764726e-07, + "loss": 0.5089, + "mean_token_accuracy": 0.8357364535331726, + "num_tokens": 37126963.0, + "step": 975 + }, + { + "epoch": 0.12415723190433788, + "ewc_loss": 1.8924474716186523e-06, + "grad_norm": 1.8301386833190918, + "learning_rate": 4.1331072488342515e-07, + "loss": 0.5201, + "mean_token_accuracy": 0.8345405459403992, + "num_tokens": 37161948.0, + "step": 976 + }, + { + "epoch": 0.12428444218292838, + "ewc_loss": 1.8924474716186523e-06, + "grad_norm": 1.6990821361541748, + "learning_rate": 4.1373463331920305e-07, + "loss": 0.4379, + "mean_token_accuracy": 0.8593959808349609, + "num_tokens": 37198552.0, + "step": 977 + }, + { + "epoch": 0.12441165246151889, + "ewc_loss": 1.8998980522155762e-06, + "grad_norm": 1.7698440551757812, + "learning_rate": 4.141585417549809e-07, + "loss": 0.507, + "mean_token_accuracy": 0.8354331254959106, + "num_tokens": 37235990.0, + "step": 978 + }, + { + "epoch": 0.1245388627401094, + "ewc_loss": 1.8998980522155762e-06, + "grad_norm": 1.8621021509170532, + "learning_rate": 4.1458245019075875e-07, + "loss": 0.4669, + "mean_token_accuracy": 0.847666323184967, + "num_tokens": 37269664.0, + "step": 979 + }, + { + "epoch": 0.1246660730186999, + "ewc_loss": 1.8998980522155762e-06, + "grad_norm": 1.6944512128829956, + "learning_rate": 4.1500635862653664e-07, + "loss": 0.4985, + "mean_token_accuracy": 0.8411142230033875, + "num_tokens": 37308647.0, + "step": 980 + }, + { + "epoch": 0.12479328329729042, + "ewc_loss": 1.8924474716186523e-06, + "grad_norm": 1.7084660530090332, + "learning_rate": 4.1543026706231454e-07, + "loss": 0.4477, + "mean_token_accuracy": 0.8560875654220581, + "num_tokens": 37345953.0, + "step": 981 + }, + { + "epoch": 0.12492049357588093, + "ewc_loss": 1.8998980522155762e-06, + "grad_norm": 1.739113211631775, + "learning_rate": 4.158541754980924e-07, + "loss": 0.4431, + "mean_token_accuracy": 0.8593774437904358, + "num_tokens": 37382735.0, + "step": 982 + }, + { + "epoch": 0.12504770385447145, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 1.8675330877304077, + "learning_rate": 4.1627808393387024e-07, + "loss": 0.5093, + "mean_token_accuracy": 0.834023118019104, + "num_tokens": 37419605.0, + "step": 983 + }, + { + "epoch": 0.12517491413306195, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 1.895469307899475, + "learning_rate": 4.1670199236964813e-07, + "loss": 0.5032, + "mean_token_accuracy": 0.8340764045715332, + "num_tokens": 37455395.0, + "step": 984 + }, + { + "epoch": 0.12530212441165245, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 2.042680501937866, + "learning_rate": 4.1712590080542603e-07, + "loss": 0.4571, + "mean_token_accuracy": 0.852480947971344, + "num_tokens": 37492208.0, + "step": 985 + }, + { + "epoch": 0.12542933469024298, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 1.67524254322052, + "learning_rate": 4.175498092412039e-07, + "loss": 0.5013, + "mean_token_accuracy": 0.8411468863487244, + "num_tokens": 37533675.0, + "step": 986 + }, + { + "epoch": 0.12555654496883348, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 1.7164386510849, + "learning_rate": 4.179737176769817e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8503377437591553, + "num_tokens": 37572792.0, + "step": 987 + }, + { + "epoch": 0.12568375524742398, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 1.6946449279785156, + "learning_rate": 4.183976261127596e-07, + "loss": 0.4629, + "mean_token_accuracy": 0.8485770225524902, + "num_tokens": 37611481.0, + "step": 988 + }, + { + "epoch": 0.1258109655260145, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 1.8960968255996704, + "learning_rate": 4.1882153454853747e-07, + "loss": 0.5405, + "mean_token_accuracy": 0.8264825344085693, + "num_tokens": 37648253.0, + "step": 989 + }, + { + "epoch": 0.125938175804605, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 1.6930402517318726, + "learning_rate": 4.1924544298431537e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.8474729061126709, + "num_tokens": 37687086.0, + "step": 990 + }, + { + "epoch": 0.12606538608319554, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 1.9741084575653076, + "learning_rate": 4.196693514200932e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8494465947151184, + "num_tokens": 37730928.0, + "step": 991 + }, + { + "epoch": 0.12619259636178604, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 2.1902270317077637, + "learning_rate": 4.200932598558711e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.8392750024795532, + "num_tokens": 37771857.0, + "step": 992 + }, + { + "epoch": 0.12631980664037654, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 1.6974890232086182, + "learning_rate": 4.2051716829164896e-07, + "loss": 0.495, + "mean_token_accuracy": 0.8431928157806396, + "num_tokens": 37817517.0, + "step": 993 + }, + { + "epoch": 0.12644701691896706, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 1.6959106922149658, + "learning_rate": 4.2094107672742686e-07, + "loss": 0.5381, + "mean_token_accuracy": 0.833017110824585, + "num_tokens": 37855891.0, + "step": 994 + }, + { + "epoch": 0.12657422719755757, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 1.9314910173416138, + "learning_rate": 4.2136498516320476e-07, + "loss": 0.4558, + "mean_token_accuracy": 0.8553627729415894, + "num_tokens": 37888719.0, + "step": 995 + }, + { + "epoch": 0.12670143747614807, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 1.7304474115371704, + "learning_rate": 4.217888935989826e-07, + "loss": 0.5097, + "mean_token_accuracy": 0.835476279258728, + "num_tokens": 37932968.0, + "step": 996 + }, + { + "epoch": 0.1268286477547386, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 1.7701103687286377, + "learning_rate": 4.2221280203476045e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8373417258262634, + "num_tokens": 37974297.0, + "step": 997 + }, + { + "epoch": 0.1269558580333291, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 1.6806007623672485, + "learning_rate": 4.2263671047053835e-07, + "loss": 0.5175, + "mean_token_accuracy": 0.8340002298355103, + "num_tokens": 38020294.0, + "step": 998 + }, + { + "epoch": 0.1270830683119196, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 1.605129361152649, + "learning_rate": 4.2306061890631625e-07, + "loss": 0.4978, + "mean_token_accuracy": 0.8413659334182739, + "num_tokens": 38063322.0, + "step": 999 + }, + { + "epoch": 0.12721027859051012, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 1.8305647373199463, + "learning_rate": 4.234845273420941e-07, + "loss": 0.5842, + "mean_token_accuracy": 0.8209104537963867, + "num_tokens": 38101052.0, + "step": 1000 + }, + { + "epoch": 0.12733748886910062, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 2.9378864765167236, + "learning_rate": 4.2390843577787194e-07, + "loss": 0.5063, + "mean_token_accuracy": 0.8356655836105347, + "num_tokens": 38145472.0, + "step": 1001 + }, + { + "epoch": 0.12746469914769112, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 1.807761311531067, + "learning_rate": 4.2433234421364984e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.8440687656402588, + "num_tokens": 38182662.0, + "step": 1002 + }, + { + "epoch": 0.12759190942628165, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 1.7641558647155762, + "learning_rate": 4.2475625264942774e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8492605686187744, + "num_tokens": 38224735.0, + "step": 1003 + }, + { + "epoch": 0.12771911970487215, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 1.9848333597183228, + "learning_rate": 4.251801610852056e-07, + "loss": 0.5127, + "mean_token_accuracy": 0.8355343341827393, + "num_tokens": 38255847.0, + "step": 1004 + }, + { + "epoch": 0.12784632998346265, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 1.7861863374710083, + "learning_rate": 4.2560406952098343e-07, + "loss": 0.5252, + "mean_token_accuracy": 0.8345994353294373, + "num_tokens": 38296921.0, + "step": 1005 + }, + { + "epoch": 0.12797354026205318, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 1.7301697731018066, + "learning_rate": 4.2602797795676133e-07, + "loss": 0.5367, + "mean_token_accuracy": 0.8303500413894653, + "num_tokens": 38336408.0, + "step": 1006 + }, + { + "epoch": 0.12810075054064368, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 1.7245726585388184, + "learning_rate": 4.2645188639253923e-07, + "loss": 0.4488, + "mean_token_accuracy": 0.8532258868217468, + "num_tokens": 38372809.0, + "step": 1007 + }, + { + "epoch": 0.12822796081923418, + "ewc_loss": 1.9669532775878906e-06, + "grad_norm": 1.8471112251281738, + "learning_rate": 4.26875794828317e-07, + "loss": 0.5045, + "mean_token_accuracy": 0.8376405239105225, + "num_tokens": 38413312.0, + "step": 1008 + }, + { + "epoch": 0.1283551710978247, + "ewc_loss": 1.9669532775878906e-06, + "grad_norm": 1.6642253398895264, + "learning_rate": 4.272997032640949e-07, + "loss": 0.523, + "mean_token_accuracy": 0.8324956297874451, + "num_tokens": 38457710.0, + "step": 1009 + }, + { + "epoch": 0.1284823813764152, + "ewc_loss": 1.9669532775878906e-06, + "grad_norm": 1.7438158988952637, + "learning_rate": 4.277236116998728e-07, + "loss": 0.4847, + "mean_token_accuracy": 0.845619261264801, + "num_tokens": 38499630.0, + "step": 1010 + }, + { + "epoch": 0.1286095916550057, + "ewc_loss": 1.9669532775878906e-06, + "grad_norm": 1.7514163255691528, + "learning_rate": 4.281475201356507e-07, + "loss": 0.4917, + "mean_token_accuracy": 0.8428372144699097, + "num_tokens": 38538133.0, + "step": 1011 + }, + { + "epoch": 0.12873680193359624, + "ewc_loss": 1.9669532775878906e-06, + "grad_norm": 1.8325592279434204, + "learning_rate": 4.285714285714285e-07, + "loss": 0.5223, + "mean_token_accuracy": 0.8334667682647705, + "num_tokens": 38576816.0, + "step": 1012 + }, + { + "epoch": 0.12886401221218674, + "ewc_loss": 1.9669532775878906e-06, + "grad_norm": 1.747849702835083, + "learning_rate": 4.289953370072064e-07, + "loss": 0.4659, + "mean_token_accuracy": 0.8498321771621704, + "num_tokens": 38615270.0, + "step": 1013 + }, + { + "epoch": 0.12899122249077727, + "ewc_loss": 1.9669532775878906e-06, + "grad_norm": 1.7821192741394043, + "learning_rate": 4.294192454429843e-07, + "loss": 0.515, + "mean_token_accuracy": 0.8335004448890686, + "num_tokens": 38654173.0, + "step": 1014 + }, + { + "epoch": 0.12911843276936777, + "ewc_loss": 1.9669532775878906e-06, + "grad_norm": 1.6032836437225342, + "learning_rate": 4.298431538787622e-07, + "loss": 0.4025, + "mean_token_accuracy": 0.8714473843574524, + "num_tokens": 38695758.0, + "step": 1015 + }, + { + "epoch": 0.12924564304795827, + "ewc_loss": 1.9669532775878906e-06, + "grad_norm": 1.8583815097808838, + "learning_rate": 4.3026706231454e-07, + "loss": 0.5452, + "mean_token_accuracy": 0.8259844779968262, + "num_tokens": 38729524.0, + "step": 1016 + }, + { + "epoch": 0.1293728533265488, + "ewc_loss": 1.9818544387817383e-06, + "grad_norm": 1.778909683227539, + "learning_rate": 4.306909707503179e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.8451169729232788, + "num_tokens": 38767500.0, + "step": 1017 + }, + { + "epoch": 0.1295000636051393, + "ewc_loss": 1.9818544387817383e-06, + "grad_norm": 1.9975826740264893, + "learning_rate": 4.311148791860958e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8431556224822998, + "num_tokens": 38798683.0, + "step": 1018 + }, + { + "epoch": 0.1296272738837298, + "ewc_loss": 1.996755599975586e-06, + "grad_norm": 1.859090805053711, + "learning_rate": 4.315387876218737e-07, + "loss": 0.531, + "mean_token_accuracy": 0.8333178162574768, + "num_tokens": 38833653.0, + "step": 1019 + }, + { + "epoch": 0.12975448416232033, + "ewc_loss": 1.996755599975586e-06, + "grad_norm": 1.9270482063293457, + "learning_rate": 4.319626960576515e-07, + "loss": 0.5294, + "mean_token_accuracy": 0.8290582895278931, + "num_tokens": 38869428.0, + "step": 1020 + }, + { + "epoch": 0.12988169444091083, + "ewc_loss": 1.996755599975586e-06, + "grad_norm": 1.6545881032943726, + "learning_rate": 4.323866044934294e-07, + "loss": 0.467, + "mean_token_accuracy": 0.8467711806297302, + "num_tokens": 38909502.0, + "step": 1021 + }, + { + "epoch": 0.13000890471950133, + "ewc_loss": 1.996755599975586e-06, + "grad_norm": 1.7143785953521729, + "learning_rate": 4.328105129292073e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8455858826637268, + "num_tokens": 38951730.0, + "step": 1022 + }, + { + "epoch": 0.13013611499809186, + "ewc_loss": 1.9818544387817383e-06, + "grad_norm": 1.7829217910766602, + "learning_rate": 4.332344213649852e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8474562168121338, + "num_tokens": 38988981.0, + "step": 1023 + }, + { + "epoch": 0.13026332527668236, + "ewc_loss": 1.996755599975586e-06, + "grad_norm": 1.8564141988754272, + "learning_rate": 4.33658329800763e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.8467100858688354, + "num_tokens": 39026289.0, + "step": 1024 + }, + { + "epoch": 0.13039053555527286, + "ewc_loss": 2.0116567611694336e-06, + "grad_norm": 1.6796432733535767, + "learning_rate": 4.340822382365409e-07, + "loss": 0.4671, + "mean_token_accuracy": 0.8481154441833496, + "num_tokens": 39069113.0, + "step": 1025 + }, + { + "epoch": 0.13051774583386339, + "ewc_loss": 2.0116567611694336e-06, + "grad_norm": 1.7118277549743652, + "learning_rate": 4.345061466723188e-07, + "loss": 0.4519, + "mean_token_accuracy": 0.8545353412628174, + "num_tokens": 39112991.0, + "step": 1026 + }, + { + "epoch": 0.13064495611245389, + "ewc_loss": 2.0116567611694336e-06, + "grad_norm": 1.7509409189224243, + "learning_rate": 4.3493005510809663e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.8374825716018677, + "num_tokens": 39149593.0, + "step": 1027 + }, + { + "epoch": 0.1307721663910444, + "ewc_loss": 2.0116567611694336e-06, + "grad_norm": 2.1075494289398193, + "learning_rate": 4.353539635438745e-07, + "loss": 0.4787, + "mean_token_accuracy": 0.8453146815299988, + "num_tokens": 39186570.0, + "step": 1028 + }, + { + "epoch": 0.13089937666963491, + "ewc_loss": 2.0116567611694336e-06, + "grad_norm": 1.9120186567306519, + "learning_rate": 4.357778719796524e-07, + "loss": 0.5412, + "mean_token_accuracy": 0.8243986368179321, + "num_tokens": 39220666.0, + "step": 1029 + }, + { + "epoch": 0.13102658694822542, + "ewc_loss": 2.0116567611694336e-06, + "grad_norm": 1.720892071723938, + "learning_rate": 4.362017804154303e-07, + "loss": 0.4602, + "mean_token_accuracy": 0.8541046977043152, + "num_tokens": 39258372.0, + "step": 1030 + }, + { + "epoch": 0.13115379722681592, + "ewc_loss": 2.0116567611694336e-06, + "grad_norm": 1.8143876791000366, + "learning_rate": 4.366256888512081e-07, + "loss": 0.4567, + "mean_token_accuracy": 0.8544107675552368, + "num_tokens": 39293647.0, + "step": 1031 + }, + { + "epoch": 0.13128100750540644, + "ewc_loss": 2.0116567611694336e-06, + "grad_norm": 1.668308973312378, + "learning_rate": 4.3704959728698597e-07, + "loss": 0.4822, + "mean_token_accuracy": 0.8426210880279541, + "num_tokens": 39332811.0, + "step": 1032 + }, + { + "epoch": 0.13140821778399694, + "ewc_loss": 2.0265579223632812e-06, + "grad_norm": 1.8494386672973633, + "learning_rate": 4.3747350572276386e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.8423546552658081, + "num_tokens": 39365971.0, + "step": 1033 + }, + { + "epoch": 0.13153542806258745, + "ewc_loss": 2.0265579223632812e-06, + "grad_norm": 2.0015087127685547, + "learning_rate": 4.3789741415854176e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.8418091535568237, + "num_tokens": 39403178.0, + "step": 1034 + }, + { + "epoch": 0.13166263834117797, + "ewc_loss": 2.0265579223632812e-06, + "grad_norm": 1.7006988525390625, + "learning_rate": 4.383213225943196e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.8414770364761353, + "num_tokens": 39441303.0, + "step": 1035 + }, + { + "epoch": 0.13178984861976847, + "ewc_loss": 2.0265579223632812e-06, + "grad_norm": 1.766770839691162, + "learning_rate": 4.3874523103009746e-07, + "loss": 0.4339, + "mean_token_accuracy": 0.8612863421440125, + "num_tokens": 39477359.0, + "step": 1036 + }, + { + "epoch": 0.13191705889835897, + "ewc_loss": 2.0265579223632812e-06, + "grad_norm": 1.7184267044067383, + "learning_rate": 4.3916913946587536e-07, + "loss": 0.4452, + "mean_token_accuracy": 0.8533747792243958, + "num_tokens": 39511687.0, + "step": 1037 + }, + { + "epoch": 0.1320442691769495, + "ewc_loss": 2.0265579223632812e-06, + "grad_norm": 1.7495508193969727, + "learning_rate": 4.3959304790165325e-07, + "loss": 0.4595, + "mean_token_accuracy": 0.8488659262657166, + "num_tokens": 39547453.0, + "step": 1038 + }, + { + "epoch": 0.13217147945554, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 1.637109637260437, + "learning_rate": 4.400169563374311e-07, + "loss": 0.4774, + "mean_token_accuracy": 0.8454525470733643, + "num_tokens": 39588934.0, + "step": 1039 + }, + { + "epoch": 0.13229868973413053, + "ewc_loss": 2.0265579223632812e-06, + "grad_norm": 1.8309991359710693, + "learning_rate": 4.4044086477320895e-07, + "loss": 0.5637, + "mean_token_accuracy": 0.8234102725982666, + "num_tokens": 39627638.0, + "step": 1040 + }, + { + "epoch": 0.13242590001272103, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 1.8248063325881958, + "learning_rate": 4.4086477320898685e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.8439341187477112, + "num_tokens": 39661696.0, + "step": 1041 + }, + { + "epoch": 0.13255311029131153, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 1.6630430221557617, + "learning_rate": 4.4128868164476474e-07, + "loss": 0.4848, + "mean_token_accuracy": 0.8481532335281372, + "num_tokens": 39702357.0, + "step": 1042 + }, + { + "epoch": 0.13268032056990206, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 1.7715566158294678, + "learning_rate": 4.417125900805426e-07, + "loss": 0.469, + "mean_token_accuracy": 0.8501965999603271, + "num_tokens": 39744960.0, + "step": 1043 + }, + { + "epoch": 0.13280753084849256, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 1.8293123245239258, + "learning_rate": 4.4213649851632044e-07, + "loss": 0.525, + "mean_token_accuracy": 0.8359039425849915, + "num_tokens": 39785394.0, + "step": 1044 + }, + { + "epoch": 0.13293474112708306, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 1.8174992799758911, + "learning_rate": 4.4256040695209834e-07, + "loss": 0.5096, + "mean_token_accuracy": 0.839939296245575, + "num_tokens": 39822698.0, + "step": 1045 + }, + { + "epoch": 0.1330619514056736, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 1.8955349922180176, + "learning_rate": 4.429843153878762e-07, + "loss": 0.5029, + "mean_token_accuracy": 0.836336612701416, + "num_tokens": 39856874.0, + "step": 1046 + }, + { + "epoch": 0.1331891616842641, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 1.6861549615859985, + "learning_rate": 4.434082238236541e-07, + "loss": 0.5342, + "mean_token_accuracy": 0.8285996317863464, + "num_tokens": 39899683.0, + "step": 1047 + }, + { + "epoch": 0.1333163719628546, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 1.6633999347686768, + "learning_rate": 4.4383213225943193e-07, + "loss": 0.5123, + "mean_token_accuracy": 0.8381425142288208, + "num_tokens": 39940565.0, + "step": 1048 + }, + { + "epoch": 0.13344358224144512, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 1.8026314973831177, + "learning_rate": 4.442560406952098e-07, + "loss": 0.5136, + "mean_token_accuracy": 0.8336001634597778, + "num_tokens": 39979010.0, + "step": 1049 + }, + { + "epoch": 0.13357079252003562, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 1.672773838043213, + "learning_rate": 4.4467994913098767e-07, + "loss": 0.4622, + "mean_token_accuracy": 0.8515820503234863, + "num_tokens": 40021151.0, + "step": 1050 + }, + { + "epoch": 0.13369800279862612, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 1.8514472246170044, + "learning_rate": 4.4510385756676557e-07, + "loss": 0.5223, + "mean_token_accuracy": 0.8301785588264465, + "num_tokens": 40059324.0, + "step": 1051 + }, + { + "epoch": 0.13382521307721665, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 1.55087411403656, + "learning_rate": 4.455277660025434e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.8454586267471313, + "num_tokens": 40104577.0, + "step": 1052 + }, + { + "epoch": 0.13395242335580715, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 1.7878470420837402, + "learning_rate": 4.459516744383213e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8360852599143982, + "num_tokens": 40139734.0, + "step": 1053 + }, + { + "epoch": 0.13407963363439765, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 1.766964316368103, + "learning_rate": 4.4637558287409916e-07, + "loss": 0.5011, + "mean_token_accuracy": 0.8379372358322144, + "num_tokens": 40180724.0, + "step": 1054 + }, + { + "epoch": 0.13420684391298818, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 1.7720329761505127, + "learning_rate": 4.4679949130987706e-07, + "loss": 0.5008, + "mean_token_accuracy": 0.8441115617752075, + "num_tokens": 40220475.0, + "step": 1055 + }, + { + "epoch": 0.13433405419157868, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 1.932740330696106, + "learning_rate": 4.472233997456549e-07, + "loss": 0.5071, + "mean_token_accuracy": 0.8392150402069092, + "num_tokens": 40258208.0, + "step": 1056 + }, + { + "epoch": 0.13446126447016918, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 1.7189732789993286, + "learning_rate": 4.476473081814328e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.84063720703125, + "num_tokens": 40299127.0, + "step": 1057 + }, + { + "epoch": 0.1345884747487597, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 1.7418791055679321, + "learning_rate": 4.4807121661721065e-07, + "loss": 0.4552, + "mean_token_accuracy": 0.851255476474762, + "num_tokens": 40339331.0, + "step": 1058 + }, + { + "epoch": 0.1347156850273502, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 1.8282134532928467, + "learning_rate": 4.4849512505298855e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.8396105766296387, + "num_tokens": 40379386.0, + "step": 1059 + }, + { + "epoch": 0.1348428953059407, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 1.7057020664215088, + "learning_rate": 4.489190334887664e-07, + "loss": 0.5306, + "mean_token_accuracy": 0.835913896560669, + "num_tokens": 40419868.0, + "step": 1060 + }, + { + "epoch": 0.13497010558453124, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 1.7460057735443115, + "learning_rate": 4.493429419245443e-07, + "loss": 0.5011, + "mean_token_accuracy": 0.8413287401199341, + "num_tokens": 40455346.0, + "step": 1061 + }, + { + "epoch": 0.13509731586312174, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 1.7421561479568481, + "learning_rate": 4.4976685036032214e-07, + "loss": 0.4596, + "mean_token_accuracy": 0.850603461265564, + "num_tokens": 40490087.0, + "step": 1062 + }, + { + "epoch": 0.13522452614171224, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 1.8837321996688843, + "learning_rate": 4.5019075879610004e-07, + "loss": 0.5178, + "mean_token_accuracy": 0.83048415184021, + "num_tokens": 40524799.0, + "step": 1063 + }, + { + "epoch": 0.13535173642030277, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 2.011430263519287, + "learning_rate": 4.506146672318779e-07, + "loss": 0.4897, + "mean_token_accuracy": 0.8413363695144653, + "num_tokens": 40563740.0, + "step": 1064 + }, + { + "epoch": 0.13547894669889327, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 1.7694449424743652, + "learning_rate": 4.5103857566765573e-07, + "loss": 0.5439, + "mean_token_accuracy": 0.830780565738678, + "num_tokens": 40604743.0, + "step": 1065 + }, + { + "epoch": 0.1356061569774838, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 1.7194327116012573, + "learning_rate": 4.5146248410343363e-07, + "loss": 0.4483, + "mean_token_accuracy": 0.854924201965332, + "num_tokens": 40640909.0, + "step": 1066 + }, + { + "epoch": 0.1357333672560743, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 1.6319714784622192, + "learning_rate": 4.5188639253921153e-07, + "loss": 0.4591, + "mean_token_accuracy": 0.8524541258811951, + "num_tokens": 40678930.0, + "step": 1067 + }, + { + "epoch": 0.1358605775346648, + "ewc_loss": 2.1010637283325195e-06, + "grad_norm": 1.8475594520568848, + "learning_rate": 4.523103009749894e-07, + "loss": 0.4521, + "mean_token_accuracy": 0.8533629775047302, + "num_tokens": 40715609.0, + "step": 1068 + }, + { + "epoch": 0.13598778781325532, + "ewc_loss": 2.1010637283325195e-06, + "grad_norm": 1.618532419204712, + "learning_rate": 4.527342094107672e-07, + "loss": 0.4778, + "mean_token_accuracy": 0.8458674550056458, + "num_tokens": 40760533.0, + "step": 1069 + }, + { + "epoch": 0.13611499809184582, + "ewc_loss": 2.1010637283325195e-06, + "grad_norm": 1.8034542798995972, + "learning_rate": 4.531581178465451e-07, + "loss": 0.505, + "mean_token_accuracy": 0.8392430543899536, + "num_tokens": 40800039.0, + "step": 1070 + }, + { + "epoch": 0.13624220837043632, + "ewc_loss": 2.1010637283325195e-06, + "grad_norm": 1.6986356973648071, + "learning_rate": 4.53582026282323e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8494530320167542, + "num_tokens": 40840207.0, + "step": 1071 + }, + { + "epoch": 0.13636941864902685, + "ewc_loss": 2.1010637283325195e-06, + "grad_norm": 1.8808640241622925, + "learning_rate": 4.5400593471810087e-07, + "loss": 0.5494, + "mean_token_accuracy": 0.8242752552032471, + "num_tokens": 40879559.0, + "step": 1072 + }, + { + "epoch": 0.13649662892761735, + "ewc_loss": 2.1010637283325195e-06, + "grad_norm": 1.6949193477630615, + "learning_rate": 4.544298431538787e-07, + "loss": 0.4772, + "mean_token_accuracy": 0.8466614484786987, + "num_tokens": 40919608.0, + "step": 1073 + }, + { + "epoch": 0.13662383920620785, + "ewc_loss": 2.1010637283325195e-06, + "grad_norm": 1.9153079986572266, + "learning_rate": 4.548537515896566e-07, + "loss": 0.5357, + "mean_token_accuracy": 0.8278622031211853, + "num_tokens": 40953648.0, + "step": 1074 + }, + { + "epoch": 0.13675104948479838, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 1.9131956100463867, + "learning_rate": 4.552776600254345e-07, + "loss": 0.5086, + "mean_token_accuracy": 0.8372794985771179, + "num_tokens": 40989675.0, + "step": 1075 + }, + { + "epoch": 0.13687825976338888, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 1.73691725730896, + "learning_rate": 4.5570156846121236e-07, + "loss": 0.4606, + "mean_token_accuracy": 0.8512106537818909, + "num_tokens": 41027992.0, + "step": 1076 + }, + { + "epoch": 0.13700547004197938, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 1.8030673265457153, + "learning_rate": 4.561254768969902e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8488478660583496, + "num_tokens": 41066925.0, + "step": 1077 + }, + { + "epoch": 0.1371326803205699, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 1.7583459615707397, + "learning_rate": 4.565493853327681e-07, + "loss": 0.4682, + "mean_token_accuracy": 0.8490298986434937, + "num_tokens": 41105090.0, + "step": 1078 + }, + { + "epoch": 0.1372598905991604, + "ewc_loss": 2.130866050720215e-06, + "grad_norm": 1.999869465827942, + "learning_rate": 4.56973293768546e-07, + "loss": 0.5114, + "mean_token_accuracy": 0.8394341468811035, + "num_tokens": 41141924.0, + "step": 1079 + }, + { + "epoch": 0.1373871008777509, + "ewc_loss": 2.130866050720215e-06, + "grad_norm": 1.8057608604431152, + "learning_rate": 4.573972022043238e-07, + "loss": 0.5709, + "mean_token_accuracy": 0.8188233375549316, + "num_tokens": 41181512.0, + "step": 1080 + }, + { + "epoch": 0.13751431115634144, + "ewc_loss": 2.130866050720215e-06, + "grad_norm": 1.7381365299224854, + "learning_rate": 4.578211106401017e-07, + "loss": 0.5195, + "mean_token_accuracy": 0.8338919878005981, + "num_tokens": 41224015.0, + "step": 1081 + }, + { + "epoch": 0.13764152143493194, + "ewc_loss": 2.130866050720215e-06, + "grad_norm": 1.8074783086776733, + "learning_rate": 4.582450190758796e-07, + "loss": 0.5221, + "mean_token_accuracy": 0.8346962928771973, + "num_tokens": 41265568.0, + "step": 1082 + }, + { + "epoch": 0.13776873171352244, + "ewc_loss": 2.130866050720215e-06, + "grad_norm": 1.762805461883545, + "learning_rate": 4.586689275116575e-07, + "loss": 0.438, + "mean_token_accuracy": 0.8574874997138977, + "num_tokens": 41303595.0, + "step": 1083 + }, + { + "epoch": 0.13789594199211297, + "ewc_loss": 2.16066837310791e-06, + "grad_norm": 1.7808164358139038, + "learning_rate": 4.590928359474353e-07, + "loss": 0.5372, + "mean_token_accuracy": 0.8296186923980713, + "num_tokens": 41340119.0, + "step": 1084 + }, + { + "epoch": 0.13802315227070347, + "ewc_loss": 2.16066837310791e-06, + "grad_norm": 1.809899091720581, + "learning_rate": 4.595167443832132e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8454153537750244, + "num_tokens": 41380184.0, + "step": 1085 + }, + { + "epoch": 0.13815036254929397, + "ewc_loss": 2.16066837310791e-06, + "grad_norm": 1.6926627159118652, + "learning_rate": 4.599406528189911e-07, + "loss": 0.4346, + "mean_token_accuracy": 0.8586900234222412, + "num_tokens": 41419038.0, + "step": 1086 + }, + { + "epoch": 0.1382775728278845, + "ewc_loss": 2.175569534301758e-06, + "grad_norm": 1.691510796546936, + "learning_rate": 4.60364561254769e-07, + "loss": 0.4763, + "mean_token_accuracy": 0.8437085151672363, + "num_tokens": 41455767.0, + "step": 1087 + }, + { + "epoch": 0.138404783106475, + "ewc_loss": 2.175569534301758e-06, + "grad_norm": 1.6213970184326172, + "learning_rate": 4.607884696905468e-07, + "loss": 0.4742, + "mean_token_accuracy": 0.8463071584701538, + "num_tokens": 41497493.0, + "step": 1088 + }, + { + "epoch": 0.1385319933850655, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 1.6580250263214111, + "learning_rate": 4.612123781263247e-07, + "loss": 0.5074, + "mean_token_accuracy": 0.8367641568183899, + "num_tokens": 41542940.0, + "step": 1089 + }, + { + "epoch": 0.13865920366365603, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 1.710084080696106, + "learning_rate": 4.616362865621026e-07, + "loss": 0.407, + "mean_token_accuracy": 0.8663215041160583, + "num_tokens": 41579302.0, + "step": 1090 + }, + { + "epoch": 0.13878641394224653, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 1.909705638885498, + "learning_rate": 4.620601949978805e-07, + "loss": 0.5045, + "mean_token_accuracy": 0.8407723903656006, + "num_tokens": 41617555.0, + "step": 1091 + }, + { + "epoch": 0.13891362422083706, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 2.152987241744995, + "learning_rate": 4.6248410343365827e-07, + "loss": 0.5578, + "mean_token_accuracy": 0.8210397958755493, + "num_tokens": 41654660.0, + "step": 1092 + }, + { + "epoch": 0.13904083449942756, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.7430284023284912, + "learning_rate": 4.6290801186943617e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8486817479133606, + "num_tokens": 41693773.0, + "step": 1093 + }, + { + "epoch": 0.13916804477801806, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.9806594848632812, + "learning_rate": 4.6333192030521407e-07, + "loss": 0.4788, + "mean_token_accuracy": 0.847690224647522, + "num_tokens": 41728864.0, + "step": 1094 + }, + { + "epoch": 0.13929525505660859, + "ewc_loss": 2.205371856689453e-06, + "grad_norm": 1.7112494707107544, + "learning_rate": 4.6375582874099196e-07, + "loss": 0.453, + "mean_token_accuracy": 0.8532794713973999, + "num_tokens": 41770596.0, + "step": 1095 + }, + { + "epoch": 0.1394224653351991, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.9000037908554077, + "learning_rate": 4.6417973717676976e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8514699339866638, + "num_tokens": 41806585.0, + "step": 1096 + }, + { + "epoch": 0.1395496756137896, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.8180193901062012, + "learning_rate": 4.6460364561254766e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.8386953473091125, + "num_tokens": 41845488.0, + "step": 1097 + }, + { + "epoch": 0.13967688589238011, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 16.608793258666992, + "learning_rate": 4.6502755404832556e-07, + "loss": 0.4657, + "mean_token_accuracy": 0.8480644226074219, + "num_tokens": 41880674.0, + "step": 1098 + }, + { + "epoch": 0.13980409617097062, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 2.0434601306915283, + "learning_rate": 4.654514624841034e-07, + "loss": 0.4807, + "mean_token_accuracy": 0.8439309597015381, + "num_tokens": 41913020.0, + "step": 1099 + }, + { + "epoch": 0.13993130644956112, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.9292305707931519, + "learning_rate": 4.6587537091988125e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.8444923162460327, + "num_tokens": 41948483.0, + "step": 1100 + }, + { + "epoch": 0.14005851672815164, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.7970260381698608, + "learning_rate": 4.6629927935565915e-07, + "loss": 0.4584, + "mean_token_accuracy": 0.8527039289474487, + "num_tokens": 41985659.0, + "step": 1101 + }, + { + "epoch": 0.14018572700674214, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.8314448595046997, + "learning_rate": 4.6672318779143705e-07, + "loss": 0.5511, + "mean_token_accuracy": 0.8239681720733643, + "num_tokens": 42027746.0, + "step": 1102 + }, + { + "epoch": 0.14031293728533265, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.9252057075500488, + "learning_rate": 4.671470962272149e-07, + "loss": 0.5065, + "mean_token_accuracy": 0.8372527360916138, + "num_tokens": 42060903.0, + "step": 1103 + }, + { + "epoch": 0.14044014756392317, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.631222128868103, + "learning_rate": 4.6757100466299274e-07, + "loss": 0.5044, + "mean_token_accuracy": 0.8421086072921753, + "num_tokens": 42102722.0, + "step": 1104 + }, + { + "epoch": 0.14056735784251367, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.7792696952819824, + "learning_rate": 4.6799491309877064e-07, + "loss": 0.5124, + "mean_token_accuracy": 0.8330154418945312, + "num_tokens": 42143418.0, + "step": 1105 + }, + { + "epoch": 0.14069456812110417, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.807608962059021, + "learning_rate": 4.6841882153454854e-07, + "loss": 0.4511, + "mean_token_accuracy": 0.8520330190658569, + "num_tokens": 42177313.0, + "step": 1106 + }, + { + "epoch": 0.1408217783996947, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.7947790622711182, + "learning_rate": 4.688427299703264e-07, + "loss": 0.4355, + "mean_token_accuracy": 0.8561891913414001, + "num_tokens": 42214158.0, + "step": 1107 + }, + { + "epoch": 0.1409489886782852, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.6432241201400757, + "learning_rate": 4.6926663840610423e-07, + "loss": 0.4919, + "mean_token_accuracy": 0.8429840803146362, + "num_tokens": 42254189.0, + "step": 1108 + }, + { + "epoch": 0.1410761989568757, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.8416740894317627, + "learning_rate": 4.6969054684188213e-07, + "loss": 0.5112, + "mean_token_accuracy": 0.8333684206008911, + "num_tokens": 42296853.0, + "step": 1109 + }, + { + "epoch": 0.14120340923546623, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.7801201343536377, + "learning_rate": 4.7011445527766003e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.8413954377174377, + "num_tokens": 42331983.0, + "step": 1110 + }, + { + "epoch": 0.14133061951405673, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.820735216140747, + "learning_rate": 4.7053836371343787e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.8426393866539001, + "num_tokens": 42370496.0, + "step": 1111 + }, + { + "epoch": 0.14145782979264723, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.7467007637023926, + "learning_rate": 4.709622721492157e-07, + "loss": 0.4704, + "mean_token_accuracy": 0.8496912121772766, + "num_tokens": 42406489.0, + "step": 1112 + }, + { + "epoch": 0.14158504007123776, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.8633720874786377, + "learning_rate": 4.713861805849936e-07, + "loss": 0.4716, + "mean_token_accuracy": 0.8464449644088745, + "num_tokens": 42440679.0, + "step": 1113 + }, + { + "epoch": 0.14171225034982826, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.8601583242416382, + "learning_rate": 4.718100890207715e-07, + "loss": 0.5425, + "mean_token_accuracy": 0.8262187242507935, + "num_tokens": 42476699.0, + "step": 1114 + }, + { + "epoch": 0.1418394606284188, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.9969792366027832, + "learning_rate": 4.7223399745654936e-07, + "loss": 0.4986, + "mean_token_accuracy": 0.8394891619682312, + "num_tokens": 42513348.0, + "step": 1115 + }, + { + "epoch": 0.1419666709070093, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 1.8089451789855957, + "learning_rate": 4.726579058923272e-07, + "loss": 0.4937, + "mean_token_accuracy": 0.8438923358917236, + "num_tokens": 42550633.0, + "step": 1116 + }, + { + "epoch": 0.1420938811855998, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.8851555585861206, + "learning_rate": 4.730818143281051e-07, + "loss": 0.5181, + "mean_token_accuracy": 0.8359503746032715, + "num_tokens": 42593649.0, + "step": 1117 + }, + { + "epoch": 0.14222109146419032, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.9194023609161377, + "learning_rate": 4.7350572276388295e-07, + "loss": 0.4647, + "mean_token_accuracy": 0.8497298359870911, + "num_tokens": 42632379.0, + "step": 1118 + }, + { + "epoch": 0.14234830174278082, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 1.8423278331756592, + "learning_rate": 4.7392963119966085e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.8403245210647583, + "num_tokens": 42670319.0, + "step": 1119 + }, + { + "epoch": 0.14247551202137132, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 1.7277021408081055, + "learning_rate": 4.7435353963543875e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.8474908471107483, + "num_tokens": 42712680.0, + "step": 1120 + }, + { + "epoch": 0.14260272229996185, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 2.1987972259521484, + "learning_rate": 4.747774480712166e-07, + "loss": 0.5003, + "mean_token_accuracy": 0.8427836298942566, + "num_tokens": 42748241.0, + "step": 1121 + }, + { + "epoch": 0.14272993257855235, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 1.8792608976364136, + "learning_rate": 4.7520135650699444e-07, + "loss": 0.5203, + "mean_token_accuracy": 0.8337138891220093, + "num_tokens": 42783208.0, + "step": 1122 + }, + { + "epoch": 0.14285714285714285, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 1.840922474861145, + "learning_rate": 4.7562526494277234e-07, + "loss": 0.4952, + "mean_token_accuracy": 0.8425476551055908, + "num_tokens": 42818570.0, + "step": 1123 + }, + { + "epoch": 0.14298435313573338, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 2.631298780441284, + "learning_rate": 4.7604917337855024e-07, + "loss": 0.4345, + "mean_token_accuracy": 0.8572124242782593, + "num_tokens": 42852891.0, + "step": 1124 + }, + { + "epoch": 0.14311156341432388, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.8019993305206299, + "learning_rate": 4.764730818143281e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.838767945766449, + "num_tokens": 42887359.0, + "step": 1125 + }, + { + "epoch": 0.14323877369291438, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 1.7705070972442627, + "learning_rate": 4.768969902501059e-07, + "loss": 0.5356, + "mean_token_accuracy": 0.8289207816123962, + "num_tokens": 42929308.0, + "step": 1126 + }, + { + "epoch": 0.1433659839715049, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 1.6603245735168457, + "learning_rate": 4.773208986858838e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.8408638834953308, + "num_tokens": 42969373.0, + "step": 1127 + }, + { + "epoch": 0.1434931942500954, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.7053252458572388, + "learning_rate": 4.777448071216617e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.8422967195510864, + "num_tokens": 43010133.0, + "step": 1128 + }, + { + "epoch": 0.1436204045286859, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.6464694738388062, + "learning_rate": 4.781687155574396e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.8345304727554321, + "num_tokens": 43053743.0, + "step": 1129 + }, + { + "epoch": 0.14374761480727644, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.6511114835739136, + "learning_rate": 4.785926239932175e-07, + "loss": 0.457, + "mean_token_accuracy": 0.8523755669593811, + "num_tokens": 43094125.0, + "step": 1130 + }, + { + "epoch": 0.14387482508586694, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.9038151502609253, + "learning_rate": 4.790165324289953e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8439004421234131, + "num_tokens": 43132399.0, + "step": 1131 + }, + { + "epoch": 0.14400203536445744, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.8704429864883423, + "learning_rate": 4.794404408647732e-07, + "loss": 0.4968, + "mean_token_accuracy": 0.8424618244171143, + "num_tokens": 43169036.0, + "step": 1132 + }, + { + "epoch": 0.14412924564304797, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.726815104484558, + "learning_rate": 4.798643493005511e-07, + "loss": 0.4312, + "mean_token_accuracy": 0.8591379523277283, + "num_tokens": 43209397.0, + "step": 1133 + }, + { + "epoch": 0.14425645592163847, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.8036633729934692, + "learning_rate": 4.80288257736329e-07, + "loss": 0.5258, + "mean_token_accuracy": 0.8344908356666565, + "num_tokens": 43246788.0, + "step": 1134 + }, + { + "epoch": 0.14438366620022897, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.7795113325119019, + "learning_rate": 4.807121661721068e-07, + "loss": 0.4392, + "mean_token_accuracy": 0.8582867383956909, + "num_tokens": 43282774.0, + "step": 1135 + }, + { + "epoch": 0.1445108764788195, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.7638152837753296, + "learning_rate": 4.811360746078847e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.8448399901390076, + "num_tokens": 43322303.0, + "step": 1136 + }, + { + "epoch": 0.14463808675741, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 2.0921547412872314, + "learning_rate": 4.815599830436625e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.842851996421814, + "num_tokens": 43354611.0, + "step": 1137 + }, + { + "epoch": 0.1447652970360005, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.8640141487121582, + "learning_rate": 4.819838914794405e-07, + "loss": 0.4711, + "mean_token_accuracy": 0.8491564989089966, + "num_tokens": 43390640.0, + "step": 1138 + }, + { + "epoch": 0.14489250731459102, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.7579468488693237, + "learning_rate": 4.824077999152183e-07, + "loss": 0.489, + "mean_token_accuracy": 0.838638424873352, + "num_tokens": 43424513.0, + "step": 1139 + }, + { + "epoch": 0.14501971759318152, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.7752057313919067, + "learning_rate": 4.828317083509962e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8403985500335693, + "num_tokens": 43461699.0, + "step": 1140 + }, + { + "epoch": 0.14514692787177205, + "ewc_loss": 2.2798776626586914e-06, + "grad_norm": 1.8818175792694092, + "learning_rate": 4.83255616786774e-07, + "loss": 0.4683, + "mean_token_accuracy": 0.8526015281677246, + "num_tokens": 43494681.0, + "step": 1141 + }, + { + "epoch": 0.14527413815036255, + "ewc_loss": 2.2798776626586914e-06, + "grad_norm": 1.786431908607483, + "learning_rate": 4.83679525222552e-07, + "loss": 0.4475, + "mean_token_accuracy": 0.8552161455154419, + "num_tokens": 43531318.0, + "step": 1142 + }, + { + "epoch": 0.14540134842895305, + "ewc_loss": 2.2798776626586914e-06, + "grad_norm": 1.7298431396484375, + "learning_rate": 4.841034336583298e-07, + "loss": 0.464, + "mean_token_accuracy": 0.855410635471344, + "num_tokens": 43569991.0, + "step": 1143 + }, + { + "epoch": 0.14552855870754358, + "ewc_loss": 2.2798776626586914e-06, + "grad_norm": 1.789144515991211, + "learning_rate": 4.845273420941076e-07, + "loss": 0.4515, + "mean_token_accuracy": 0.8566313982009888, + "num_tokens": 43609978.0, + "step": 1144 + }, + { + "epoch": 0.14565576898613408, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.9957011938095093, + "learning_rate": 4.849512505298855e-07, + "loss": 0.4974, + "mean_token_accuracy": 0.8339269757270813, + "num_tokens": 43640419.0, + "step": 1145 + }, + { + "epoch": 0.14578297926472458, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.796173334121704, + "learning_rate": 4.853751589656634e-07, + "loss": 0.4617, + "mean_token_accuracy": 0.8508448600769043, + "num_tokens": 43682408.0, + "step": 1146 + }, + { + "epoch": 0.1459101895433151, + "ewc_loss": 2.2798776626586914e-06, + "grad_norm": 1.7354185581207275, + "learning_rate": 4.857990674014413e-07, + "loss": 0.4954, + "mean_token_accuracy": 0.84067302942276, + "num_tokens": 43726158.0, + "step": 1147 + }, + { + "epoch": 0.1460373998219056, + "ewc_loss": 2.2798776626586914e-06, + "grad_norm": 1.8170894384384155, + "learning_rate": 4.862229758372191e-07, + "loss": 0.5258, + "mean_token_accuracy": 0.8289415836334229, + "num_tokens": 43766466.0, + "step": 1148 + }, + { + "epoch": 0.1461646101004961, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.7150239944458008, + "learning_rate": 4.86646884272997e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.8323692083358765, + "num_tokens": 43807255.0, + "step": 1149 + }, + { + "epoch": 0.14629182037908664, + "ewc_loss": 2.2798776626586914e-06, + "grad_norm": 1.7287558317184448, + "learning_rate": 4.870707927087749e-07, + "loss": 0.4937, + "mean_token_accuracy": 0.8407489657402039, + "num_tokens": 43845419.0, + "step": 1150 + }, + { + "epoch": 0.14641903065767714, + "ewc_loss": 2.2798776626586914e-06, + "grad_norm": 1.7893342971801758, + "learning_rate": 4.874947011445528e-07, + "loss": 0.5, + "mean_token_accuracy": 0.841937243938446, + "num_tokens": 43882942.0, + "step": 1151 + }, + { + "epoch": 0.14654624093626764, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.6037507057189941, + "learning_rate": 4.879186095803306e-07, + "loss": 0.4248, + "mean_token_accuracy": 0.8626779317855835, + "num_tokens": 43923109.0, + "step": 1152 + }, + { + "epoch": 0.14667345121485817, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.7805211544036865, + "learning_rate": 4.883425180161085e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.850461483001709, + "num_tokens": 43959752.0, + "step": 1153 + }, + { + "epoch": 0.14680066149344867, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.774027943611145, + "learning_rate": 4.887664264518864e-07, + "loss": 0.5057, + "mean_token_accuracy": 0.8414218425750732, + "num_tokens": 43996247.0, + "step": 1154 + }, + { + "epoch": 0.14692787177203917, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.76215398311615, + "learning_rate": 4.891903348876643e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.8449008464813232, + "num_tokens": 44035751.0, + "step": 1155 + }, + { + "epoch": 0.1470550820506297, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.7705883979797363, + "learning_rate": 4.896142433234421e-07, + "loss": 0.5243, + "mean_token_accuracy": 0.8385753035545349, + "num_tokens": 44073225.0, + "step": 1156 + }, + { + "epoch": 0.1471822923292202, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.7820249795913696, + "learning_rate": 4.9003815175922e-07, + "loss": 0.5276, + "mean_token_accuracy": 0.8374383449554443, + "num_tokens": 44111761.0, + "step": 1157 + }, + { + "epoch": 0.1473095026078107, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.8049585819244385, + "learning_rate": 4.904620601949979e-07, + "loss": 0.4791, + "mean_token_accuracy": 0.8451285362243652, + "num_tokens": 44151311.0, + "step": 1158 + }, + { + "epoch": 0.14743671288640123, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 2.0693013668060303, + "learning_rate": 4.908859686307758e-07, + "loss": 0.5207, + "mean_token_accuracy": 0.8363070487976074, + "num_tokens": 44184623.0, + "step": 1159 + }, + { + "epoch": 0.14756392316499173, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.9395933151245117, + "learning_rate": 4.913098770665536e-07, + "loss": 0.5101, + "mean_token_accuracy": 0.8406574726104736, + "num_tokens": 44221753.0, + "step": 1160 + }, + { + "epoch": 0.14769113344358223, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.7027143239974976, + "learning_rate": 4.917337855023314e-07, + "loss": 0.4626, + "mean_token_accuracy": 0.8474034070968628, + "num_tokens": 44262542.0, + "step": 1161 + }, + { + "epoch": 0.14781834372217276, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.7788174152374268, + "learning_rate": 4.921576939381094e-07, + "loss": 0.4848, + "mean_token_accuracy": 0.8482369780540466, + "num_tokens": 44300294.0, + "step": 1162 + }, + { + "epoch": 0.14794555400076326, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.821205496788025, + "learning_rate": 4.925816023738872e-07, + "loss": 0.5292, + "mean_token_accuracy": 0.8307841420173645, + "num_tokens": 44338476.0, + "step": 1163 + }, + { + "epoch": 0.14807276427935376, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.975510835647583, + "learning_rate": 4.930055108096651e-07, + "loss": 0.4286, + "mean_token_accuracy": 0.8618281483650208, + "num_tokens": 44367437.0, + "step": 1164 + }, + { + "epoch": 0.1481999745579443, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.8597767353057861, + "learning_rate": 4.934294192454429e-07, + "loss": 0.543, + "mean_token_accuracy": 0.8266721963882446, + "num_tokens": 44402636.0, + "step": 1165 + }, + { + "epoch": 0.1483271848365348, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.7756463289260864, + "learning_rate": 4.938533276812209e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.8450458645820618, + "num_tokens": 44438187.0, + "step": 1166 + }, + { + "epoch": 0.14845439511512531, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.8386863470077515, + "learning_rate": 4.942772361169987e-07, + "loss": 0.408, + "mean_token_accuracy": 0.8636008501052856, + "num_tokens": 44478345.0, + "step": 1167 + }, + { + "epoch": 0.14858160539371582, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.7081555128097534, + "learning_rate": 4.947011445527766e-07, + "loss": 0.4845, + "mean_token_accuracy": 0.8455220460891724, + "num_tokens": 44520272.0, + "step": 1168 + }, + { + "epoch": 0.14870881567230632, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.71371328830719, + "learning_rate": 4.951250529885544e-07, + "loss": 0.4409, + "mean_token_accuracy": 0.8597403168678284, + "num_tokens": 44561058.0, + "step": 1169 + }, + { + "epoch": 0.14883602595089684, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 2.1863749027252197, + "learning_rate": 4.955489614243324e-07, + "loss": 0.5714, + "mean_token_accuracy": 0.8160808086395264, + "num_tokens": 44599371.0, + "step": 1170 + }, + { + "epoch": 0.14896323622948734, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.5922346115112305, + "learning_rate": 4.959728698601102e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8371139764785767, + "num_tokens": 44640499.0, + "step": 1171 + }, + { + "epoch": 0.14909044650807785, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.6431810855865479, + "learning_rate": 4.963967782958881e-07, + "loss": 0.4716, + "mean_token_accuracy": 0.8488808870315552, + "num_tokens": 44683164.0, + "step": 1172 + }, + { + "epoch": 0.14921765678666837, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.8598281145095825, + "learning_rate": 4.968206867316659e-07, + "loss": 0.4783, + "mean_token_accuracy": 0.8451050519943237, + "num_tokens": 44721938.0, + "step": 1173 + }, + { + "epoch": 0.14934486706525887, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.9743162393569946, + "learning_rate": 4.972445951674439e-07, + "loss": 0.4536, + "mean_token_accuracy": 0.8519638776779175, + "num_tokens": 44753506.0, + "step": 1174 + }, + { + "epoch": 0.14947207734384937, + "ewc_loss": 2.339482307434082e-06, + "grad_norm": 20.455493927001953, + "learning_rate": 4.976685036032216e-07, + "loss": 0.5435, + "mean_token_accuracy": 0.8259340524673462, + "num_tokens": 44792170.0, + "step": 1175 + }, + { + "epoch": 0.1495992876224399, + "ewc_loss": 2.3245811462402344e-06, + "grad_norm": 1.7858363389968872, + "learning_rate": 4.980924120389996e-07, + "loss": 0.4226, + "mean_token_accuracy": 0.8646457195281982, + "num_tokens": 44830733.0, + "step": 1176 + }, + { + "epoch": 0.1497264979010304, + "ewc_loss": 2.3245811462402344e-06, + "grad_norm": 1.8213300704956055, + "learning_rate": 4.985163204747774e-07, + "loss": 0.4345, + "mean_token_accuracy": 0.859601616859436, + "num_tokens": 44869743.0, + "step": 1177 + }, + { + "epoch": 0.1498537081796209, + "ewc_loss": 2.3245811462402344e-06, + "grad_norm": 1.8270212411880493, + "learning_rate": 4.989402289105554e-07, + "loss": 0.4814, + "mean_token_accuracy": 0.8506258726119995, + "num_tokens": 44913358.0, + "step": 1178 + }, + { + "epoch": 0.14998091845821143, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 2.0283632278442383, + "learning_rate": 4.993641373463331e-07, + "loss": 0.5401, + "mean_token_accuracy": 0.8272973895072937, + "num_tokens": 44949963.0, + "step": 1179 + }, + { + "epoch": 0.15010812873680193, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.745640754699707, + "learning_rate": 4.997880457821111e-07, + "loss": 0.4939, + "mean_token_accuracy": 0.8404513597488403, + "num_tokens": 44988569.0, + "step": 1180 + }, + { + "epoch": 0.15023533901539243, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.5905967950820923, + "learning_rate": 5.002119542178889e-07, + "loss": 0.4717, + "mean_token_accuracy": 0.847190260887146, + "num_tokens": 45032634.0, + "step": 1181 + }, + { + "epoch": 0.15036254929398296, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.709046483039856, + "learning_rate": 5.006358626536667e-07, + "loss": 0.4422, + "mean_token_accuracy": 0.8578439354896545, + "num_tokens": 45069685.0, + "step": 1182 + }, + { + "epoch": 0.15048975957257346, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.771260380744934, + "learning_rate": 5.010597710894446e-07, + "loss": 0.4636, + "mean_token_accuracy": 0.8507376313209534, + "num_tokens": 45107481.0, + "step": 1183 + }, + { + "epoch": 0.15061696985116396, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.8338663578033447, + "learning_rate": 5.014836795252225e-07, + "loss": 0.4495, + "mean_token_accuracy": 0.8538128137588501, + "num_tokens": 45145747.0, + "step": 1184 + }, + { + "epoch": 0.1507441801297545, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.7999553680419922, + "learning_rate": 5.019075879610004e-07, + "loss": 0.4558, + "mean_token_accuracy": 0.8561285734176636, + "num_tokens": 45183140.0, + "step": 1185 + }, + { + "epoch": 0.150871390408345, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.7501194477081299, + "learning_rate": 5.023314963967783e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.841382622718811, + "num_tokens": 45224911.0, + "step": 1186 + }, + { + "epoch": 0.1509986006869355, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.8983830213546753, + "learning_rate": 5.027554048325562e-07, + "loss": 0.5419, + "mean_token_accuracy": 0.8273356556892395, + "num_tokens": 45257388.0, + "step": 1187 + }, + { + "epoch": 0.15112581096552602, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.7773096561431885, + "learning_rate": 5.03179313268334e-07, + "loss": 0.5288, + "mean_token_accuracy": 0.83130943775177, + "num_tokens": 45297876.0, + "step": 1188 + }, + { + "epoch": 0.15125302124411652, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.890961766242981, + "learning_rate": 5.036032217041119e-07, + "loss": 0.505, + "mean_token_accuracy": 0.8356335163116455, + "num_tokens": 45334040.0, + "step": 1189 + }, + { + "epoch": 0.15138023152270705, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.6714913845062256, + "learning_rate": 5.040271301398897e-07, + "loss": 0.4877, + "mean_token_accuracy": 0.8408868312835693, + "num_tokens": 45375985.0, + "step": 1190 + }, + { + "epoch": 0.15150744180129755, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.6745845079421997, + "learning_rate": 5.044510385756676e-07, + "loss": 0.4562, + "mean_token_accuracy": 0.8498647212982178, + "num_tokens": 45419951.0, + "step": 1191 + }, + { + "epoch": 0.15163465207988805, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.8117990493774414, + "learning_rate": 5.048749470114455e-07, + "loss": 0.4688, + "mean_token_accuracy": 0.8495277762413025, + "num_tokens": 45456988.0, + "step": 1192 + }, + { + "epoch": 0.15176186235847858, + "ewc_loss": 2.3245811462402344e-06, + "grad_norm": 1.9909318685531616, + "learning_rate": 5.052988554472234e-07, + "loss": 0.488, + "mean_token_accuracy": 0.8415358662605286, + "num_tokens": 45488226.0, + "step": 1193 + }, + { + "epoch": 0.15188907263706908, + "ewc_loss": 2.3245811462402344e-06, + "grad_norm": 1.7582335472106934, + "learning_rate": 5.057227638830013e-07, + "loss": 0.4884, + "mean_token_accuracy": 0.8442033529281616, + "num_tokens": 45524661.0, + "step": 1194 + }, + { + "epoch": 0.15201628291565958, + "ewc_loss": 2.3245811462402344e-06, + "grad_norm": 1.9375993013381958, + "learning_rate": 5.061466723187792e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8435616493225098, + "num_tokens": 45563491.0, + "step": 1195 + }, + { + "epoch": 0.1521434931942501, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.9121869802474976, + "learning_rate": 5.065705807545569e-07, + "loss": 0.459, + "mean_token_accuracy": 0.8529644012451172, + "num_tokens": 45596771.0, + "step": 1196 + }, + { + "epoch": 0.1522707034728406, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.6356462240219116, + "learning_rate": 5.069944891903349e-07, + "loss": 0.4744, + "mean_token_accuracy": 0.8456848859786987, + "num_tokens": 45640049.0, + "step": 1197 + }, + { + "epoch": 0.1523979137514311, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.853084921836853, + "learning_rate": 5.074183976261127e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.8432008028030396, + "num_tokens": 45677375.0, + "step": 1198 + }, + { + "epoch": 0.15252512403002164, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.786065697669983, + "learning_rate": 5.078423060618906e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8466446399688721, + "num_tokens": 45719253.0, + "step": 1199 + }, + { + "epoch": 0.15265233430861214, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.824876070022583, + "learning_rate": 5.082662144976685e-07, + "loss": 0.478, + "mean_token_accuracy": 0.8435364961624146, + "num_tokens": 45756065.0, + "step": 1200 + }, + { + "epoch": 0.15277954458720264, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.8288711309432983, + "learning_rate": 5.086901229334464e-07, + "loss": 0.469, + "mean_token_accuracy": 0.8507923483848572, + "num_tokens": 45791707.0, + "step": 1201 + }, + { + "epoch": 0.15290675486579317, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.9992156028747559, + "learning_rate": 5.091140313692243e-07, + "loss": 0.4811, + "mean_token_accuracy": 0.8431586027145386, + "num_tokens": 45828464.0, + "step": 1202 + }, + { + "epoch": 0.15303396514438367, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.642188549041748, + "learning_rate": 5.095379398050022e-07, + "loss": 0.4639, + "mean_token_accuracy": 0.8487038016319275, + "num_tokens": 45867046.0, + "step": 1203 + }, + { + "epoch": 0.15316117542297417, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.83568274974823, + "learning_rate": 5.099618482407799e-07, + "loss": 0.4926, + "mean_token_accuracy": 0.8405511379241943, + "num_tokens": 45900530.0, + "step": 1204 + }, + { + "epoch": 0.1532883857015647, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.7748889923095703, + "learning_rate": 5.103857566765578e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.848675012588501, + "num_tokens": 45940002.0, + "step": 1205 + }, + { + "epoch": 0.1534155959801552, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 2.09824538230896, + "learning_rate": 5.108096651123357e-07, + "loss": 0.4349, + "mean_token_accuracy": 0.8593648672103882, + "num_tokens": 45975299.0, + "step": 1206 + }, + { + "epoch": 0.1535428062587457, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.8065623044967651, + "learning_rate": 5.112335735481135e-07, + "loss": 0.4742, + "mean_token_accuracy": 0.8459240794181824, + "num_tokens": 46012693.0, + "step": 1207 + }, + { + "epoch": 0.15367001653733622, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.8078185319900513, + "learning_rate": 5.116574819838915e-07, + "loss": 0.3837, + "mean_token_accuracy": 0.8766043186187744, + "num_tokens": 46051558.0, + "step": 1208 + }, + { + "epoch": 0.15379722681592672, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.8505184650421143, + "learning_rate": 5.120813904196693e-07, + "loss": 0.4649, + "mean_token_accuracy": 0.8524016737937927, + "num_tokens": 46087173.0, + "step": 1209 + }, + { + "epoch": 0.15392443709451722, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.9257192611694336, + "learning_rate": 5.125052988554473e-07, + "loss": 0.5204, + "mean_token_accuracy": 0.8356389403343201, + "num_tokens": 46120482.0, + "step": 1210 + }, + { + "epoch": 0.15405164737310775, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.788670301437378, + "learning_rate": 5.12929207291225e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8491483926773071, + "num_tokens": 46154478.0, + "step": 1211 + }, + { + "epoch": 0.15417885765169825, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.6315085887908936, + "learning_rate": 5.133531157270029e-07, + "loss": 0.4436, + "mean_token_accuracy": 0.8575376868247986, + "num_tokens": 46195012.0, + "step": 1212 + }, + { + "epoch": 0.15430606793028875, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.8932970762252808, + "learning_rate": 5.137770241627808e-07, + "loss": 0.4102, + "mean_token_accuracy": 0.8673381805419922, + "num_tokens": 46230641.0, + "step": 1213 + }, + { + "epoch": 0.15443327820887928, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.9133719205856323, + "learning_rate": 5.142009325985587e-07, + "loss": 0.4485, + "mean_token_accuracy": 0.8576677441596985, + "num_tokens": 46268993.0, + "step": 1214 + }, + { + "epoch": 0.15456048848746978, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.7176553010940552, + "learning_rate": 5.146248410343365e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8453654646873474, + "num_tokens": 46309310.0, + "step": 1215 + }, + { + "epoch": 0.1546876987660603, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.62175452709198, + "learning_rate": 5.150487494701145e-07, + "loss": 0.4231, + "mean_token_accuracy": 0.8618674874305725, + "num_tokens": 46352415.0, + "step": 1216 + }, + { + "epoch": 0.1548149090446508, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.7427972555160522, + "learning_rate": 5.154726579058923e-07, + "loss": 0.4776, + "mean_token_accuracy": 0.8455831408500671, + "num_tokens": 46387838.0, + "step": 1217 + }, + { + "epoch": 0.1549421193232413, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.559943675994873, + "learning_rate": 5.158965663416703e-07, + "loss": 0.4736, + "mean_token_accuracy": 0.846977710723877, + "num_tokens": 46432669.0, + "step": 1218 + }, + { + "epoch": 0.15506932960183184, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.5969606637954712, + "learning_rate": 5.16320474777448e-07, + "loss": 0.4312, + "mean_token_accuracy": 0.8602331876754761, + "num_tokens": 46474383.0, + "step": 1219 + }, + { + "epoch": 0.15519653988042234, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.7775040864944458, + "learning_rate": 5.167443832132259e-07, + "loss": 0.4784, + "mean_token_accuracy": 0.845136284828186, + "num_tokens": 46514263.0, + "step": 1220 + }, + { + "epoch": 0.15532375015901284, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.8278868198394775, + "learning_rate": 5.171682916490038e-07, + "loss": 0.4482, + "mean_token_accuracy": 0.8544783592224121, + "num_tokens": 46549835.0, + "step": 1221 + }, + { + "epoch": 0.15545096043760337, + "ewc_loss": 2.384185791015625e-06, + "grad_norm": 1.8300769329071045, + "learning_rate": 5.175922000847816e-07, + "loss": 0.4338, + "mean_token_accuracy": 0.858802855014801, + "num_tokens": 46584811.0, + "step": 1222 + }, + { + "epoch": 0.15557817071619387, + "ewc_loss": 2.384185791015625e-06, + "grad_norm": 2.0658488273620605, + "learning_rate": 5.180161085205595e-07, + "loss": 0.4566, + "mean_token_accuracy": 0.8525986075401306, + "num_tokens": 46621441.0, + "step": 1223 + }, + { + "epoch": 0.15570538099478437, + "ewc_loss": 2.384185791015625e-06, + "grad_norm": 1.7919769287109375, + "learning_rate": 5.184400169563374e-07, + "loss": 0.5079, + "mean_token_accuracy": 0.8374493718147278, + "num_tokens": 46661572.0, + "step": 1224 + }, + { + "epoch": 0.1558325912733749, + "ewc_loss": 2.384185791015625e-06, + "grad_norm": 1.7130240201950073, + "learning_rate": 5.188639253921153e-07, + "loss": 0.4831, + "mean_token_accuracy": 0.8444839715957642, + "num_tokens": 46702157.0, + "step": 1225 + }, + { + "epoch": 0.1559598015519654, + "ewc_loss": 2.384185791015625e-06, + "grad_norm": 1.6875855922698975, + "learning_rate": 5.192878338278932e-07, + "loss": 0.4576, + "mean_token_accuracy": 0.8517680168151855, + "num_tokens": 46739119.0, + "step": 1226 + }, + { + "epoch": 0.1560870118305559, + "ewc_loss": 2.384185791015625e-06, + "grad_norm": 1.743152141571045, + "learning_rate": 5.19711742263671e-07, + "loss": 0.5173, + "mean_token_accuracy": 0.8343522548675537, + "num_tokens": 46783990.0, + "step": 1227 + }, + { + "epoch": 0.15621422210914643, + "ewc_loss": 2.384185791015625e-06, + "grad_norm": 1.8792152404785156, + "learning_rate": 5.201356506994488e-07, + "loss": 0.4584, + "mean_token_accuracy": 0.8472940921783447, + "num_tokens": 46816131.0, + "step": 1228 + }, + { + "epoch": 0.15634143238773693, + "ewc_loss": 2.384185791015625e-06, + "grad_norm": 1.730146050453186, + "learning_rate": 5.205595591352268e-07, + "loss": 0.5373, + "mean_token_accuracy": 0.8276872038841248, + "num_tokens": 46856453.0, + "step": 1229 + }, + { + "epoch": 0.15646864266632743, + "ewc_loss": 2.384185791015625e-06, + "grad_norm": 1.6943871974945068, + "learning_rate": 5.209834675710046e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.8468886613845825, + "num_tokens": 46897961.0, + "step": 1230 + }, + { + "epoch": 0.15659585294491796, + "ewc_loss": 2.384185791015625e-06, + "grad_norm": 1.9239894151687622, + "learning_rate": 5.214073760067825e-07, + "loss": 0.5511, + "mean_token_accuracy": 0.8255987167358398, + "num_tokens": 46931961.0, + "step": 1231 + }, + { + "epoch": 0.15672306322350846, + "ewc_loss": 2.384185791015625e-06, + "grad_norm": 1.8317064046859741, + "learning_rate": 5.218312844425604e-07, + "loss": 0.4725, + "mean_token_accuracy": 0.8507897257804871, + "num_tokens": 46967822.0, + "step": 1232 + }, + { + "epoch": 0.15685027350209896, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 1.7626900672912598, + "learning_rate": 5.222551928783383e-07, + "loss": 0.4561, + "mean_token_accuracy": 0.8523968458175659, + "num_tokens": 47004511.0, + "step": 1233 + }, + { + "epoch": 0.1569774837806895, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 1.7462282180786133, + "learning_rate": 5.226791013141161e-07, + "loss": 0.5081, + "mean_token_accuracy": 0.8391935229301453, + "num_tokens": 47047244.0, + "step": 1234 + }, + { + "epoch": 0.15710469405928, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 1.8016232252120972, + "learning_rate": 5.23103009749894e-07, + "loss": 0.4638, + "mean_token_accuracy": 0.8512589931488037, + "num_tokens": 47088206.0, + "step": 1235 + }, + { + "epoch": 0.1572319043378705, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 1.8038636445999146, + "learning_rate": 5.235269181856718e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8468335866928101, + "num_tokens": 47132733.0, + "step": 1236 + }, + { + "epoch": 0.15735911461646102, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 1.9742565155029297, + "learning_rate": 5.239508266214498e-07, + "loss": 0.5302, + "mean_token_accuracy": 0.8283979296684265, + "num_tokens": 47166840.0, + "step": 1237 + }, + { + "epoch": 0.15748632489505152, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 1.801034927368164, + "learning_rate": 5.243747350572276e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.8450037240982056, + "num_tokens": 47203444.0, + "step": 1238 + }, + { + "epoch": 0.15761353517364202, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 1.7590839862823486, + "learning_rate": 5.247986434930056e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.8473242521286011, + "num_tokens": 47240380.0, + "step": 1239 + }, + { + "epoch": 0.15774074545223254, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 1.8392333984375, + "learning_rate": 5.252225519287834e-07, + "loss": 0.5422, + "mean_token_accuracy": 0.8268024921417236, + "num_tokens": 47277886.0, + "step": 1240 + }, + { + "epoch": 0.15786795573082305, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 1.8137168884277344, + "learning_rate": 5.256464603645613e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8513479232788086, + "num_tokens": 47315261.0, + "step": 1241 + }, + { + "epoch": 0.15799516600941357, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 1.6609042882919312, + "learning_rate": 5.260703688003391e-07, + "loss": 0.4767, + "mean_token_accuracy": 0.8473070859909058, + "num_tokens": 47357489.0, + "step": 1242 + }, + { + "epoch": 0.15812237628800407, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 2.0082905292510986, + "learning_rate": 5.26494277236117e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.8439321517944336, + "num_tokens": 47389851.0, + "step": 1243 + }, + { + "epoch": 0.15824958656659457, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 1.9201643466949463, + "learning_rate": 5.269181856718948e-07, + "loss": 0.5167, + "mean_token_accuracy": 0.8341805934906006, + "num_tokens": 47427491.0, + "step": 1244 + }, + { + "epoch": 0.1583767968451851, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 1.7652318477630615, + "learning_rate": 5.273420941076727e-07, + "loss": 0.4578, + "mean_token_accuracy": 0.8532031178474426, + "num_tokens": 47462790.0, + "step": 1245 + }, + { + "epoch": 0.1585040071237756, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 1.669905662536621, + "learning_rate": 5.277660025434506e-07, + "loss": 0.4221, + "mean_token_accuracy": 0.8633050322532654, + "num_tokens": 47502240.0, + "step": 1246 + }, + { + "epoch": 0.1586312174023661, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 1.7860147953033447, + "learning_rate": 5.281899109792285e-07, + "loss": 0.4453, + "mean_token_accuracy": 0.8554617166519165, + "num_tokens": 47542493.0, + "step": 1247 + }, + { + "epoch": 0.15875842768095663, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 1.77743661403656, + "learning_rate": 5.286138194150064e-07, + "loss": 0.4811, + "mean_token_accuracy": 0.8472805023193359, + "num_tokens": 47583378.0, + "step": 1248 + }, + { + "epoch": 0.15888563795954713, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 1.7470755577087402, + "learning_rate": 5.290377278507841e-07, + "loss": 0.5071, + "mean_token_accuracy": 0.8428145051002502, + "num_tokens": 47622671.0, + "step": 1249 + }, + { + "epoch": 0.15901284823813763, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 2.0280237197875977, + "learning_rate": 5.294616362865621e-07, + "loss": 0.4644, + "mean_token_accuracy": 0.854232907295227, + "num_tokens": 47663774.0, + "step": 1250 + }, + { + "epoch": 0.15914005851672816, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 1.7417163848876953, + "learning_rate": 5.298855447223399e-07, + "loss": 0.5353, + "mean_token_accuracy": 0.8380060195922852, + "num_tokens": 47706143.0, + "step": 1251 + }, + { + "epoch": 0.15926726879531866, + "ewc_loss": 2.428889274597168e-06, + "grad_norm": 1.854737639427185, + "learning_rate": 5.303094531581178e-07, + "loss": 0.4964, + "mean_token_accuracy": 0.8405086398124695, + "num_tokens": 47741197.0, + "step": 1252 + }, + { + "epoch": 0.15939447907390916, + "ewc_loss": 2.428889274597168e-06, + "grad_norm": 1.830969214439392, + "learning_rate": 5.307333615938957e-07, + "loss": 0.4198, + "mean_token_accuracy": 0.8630145192146301, + "num_tokens": 47779306.0, + "step": 1253 + }, + { + "epoch": 0.1595216893524997, + "ewc_loss": 2.428889274597168e-06, + "grad_norm": 2.0100502967834473, + "learning_rate": 5.311572700296736e-07, + "loss": 0.5055, + "mean_token_accuracy": 0.8372079133987427, + "num_tokens": 47811056.0, + "step": 1254 + }, + { + "epoch": 0.1596488996310902, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 1.8370808362960815, + "learning_rate": 5.315811784654515e-07, + "loss": 0.414, + "mean_token_accuracy": 0.8612129092216492, + "num_tokens": 47847075.0, + "step": 1255 + }, + { + "epoch": 0.1597761099096807, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 1.7018942832946777, + "learning_rate": 5.320050869012294e-07, + "loss": 0.4724, + "mean_token_accuracy": 0.8484866619110107, + "num_tokens": 47892639.0, + "step": 1256 + }, + { + "epoch": 0.15990332018827122, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 1.7463302612304688, + "learning_rate": 5.324289953370071e-07, + "loss": 0.5278, + "mean_token_accuracy": 0.8338376879692078, + "num_tokens": 47933522.0, + "step": 1257 + }, + { + "epoch": 0.16003053046686172, + "ewc_loss": 2.4437904357910156e-06, + "grad_norm": 1.7052702903747559, + "learning_rate": 5.328529037727851e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.8493366241455078, + "num_tokens": 47977307.0, + "step": 1258 + }, + { + "epoch": 0.16015774074545222, + "ewc_loss": 2.4437904357910156e-06, + "grad_norm": 1.7849550247192383, + "learning_rate": 5.332768122085629e-07, + "loss": 0.4617, + "mean_token_accuracy": 0.8499162793159485, + "num_tokens": 48015129.0, + "step": 1259 + }, + { + "epoch": 0.16028495102404275, + "ewc_loss": 2.4437904357910156e-06, + "grad_norm": 2.03383731842041, + "learning_rate": 5.337007206443408e-07, + "loss": 0.4964, + "mean_token_accuracy": 0.8389720320701599, + "num_tokens": 48051904.0, + "step": 1260 + }, + { + "epoch": 0.16041216130263325, + "ewc_loss": 2.4437904357910156e-06, + "grad_norm": 1.7772196531295776, + "learning_rate": 5.341246290801187e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.8555932641029358, + "num_tokens": 48094001.0, + "step": 1261 + }, + { + "epoch": 0.16053937158122375, + "ewc_loss": 2.4437904357910156e-06, + "grad_norm": 1.6549632549285889, + "learning_rate": 5.345485375158966e-07, + "loss": 0.4502, + "mean_token_accuracy": 0.8517481088638306, + "num_tokens": 48134124.0, + "step": 1262 + }, + { + "epoch": 0.16066658185981428, + "ewc_loss": 2.4437904357910156e-06, + "grad_norm": 1.8029924631118774, + "learning_rate": 5.349724459516745e-07, + "loss": 0.4405, + "mean_token_accuracy": 0.8562151789665222, + "num_tokens": 48170730.0, + "step": 1263 + }, + { + "epoch": 0.16079379213840478, + "ewc_loss": 2.4437904357910156e-06, + "grad_norm": 2.044513463973999, + "learning_rate": 5.353963543874522e-07, + "loss": 0.5113, + "mean_token_accuracy": 0.837269127368927, + "num_tokens": 48212640.0, + "step": 1264 + }, + { + "epoch": 0.1609210024169953, + "ewc_loss": 2.4586915969848633e-06, + "grad_norm": 2.0243656635284424, + "learning_rate": 5.358202628232301e-07, + "loss": 0.487, + "mean_token_accuracy": 0.8447730541229248, + "num_tokens": 48243136.0, + "step": 1265 + }, + { + "epoch": 0.1610482126955858, + "ewc_loss": 2.4586915969848633e-06, + "grad_norm": 1.8003805875778198, + "learning_rate": 5.36244171259008e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.8488349914550781, + "num_tokens": 48282905.0, + "step": 1266 + }, + { + "epoch": 0.1611754229741763, + "ewc_loss": 2.4586915969848633e-06, + "grad_norm": 1.7928589582443237, + "learning_rate": 5.366680796947859e-07, + "loss": 0.5078, + "mean_token_accuracy": 0.8396742343902588, + "num_tokens": 48323325.0, + "step": 1267 + }, + { + "epoch": 0.16130263325276684, + "ewc_loss": 2.4586915969848633e-06, + "grad_norm": 1.9990688562393188, + "learning_rate": 5.370919881305637e-07, + "loss": 0.484, + "mean_token_accuracy": 0.8474347591400146, + "num_tokens": 48358400.0, + "step": 1268 + }, + { + "epoch": 0.16142984353135734, + "ewc_loss": 2.5033950805664062e-06, + "grad_norm": 2.4383416175842285, + "learning_rate": 5.375158965663417e-07, + "loss": 0.487, + "mean_token_accuracy": 0.8466840982437134, + "num_tokens": 48393018.0, + "step": 1269 + }, + { + "epoch": 0.16155705380994784, + "ewc_loss": 2.5033950805664062e-06, + "grad_norm": 2.0369153022766113, + "learning_rate": 5.379398050021195e-07, + "loss": 0.4744, + "mean_token_accuracy": 0.8472998738288879, + "num_tokens": 48432952.0, + "step": 1270 + }, + { + "epoch": 0.16168426408853837, + "ewc_loss": 2.5033950805664062e-06, + "grad_norm": 1.972045660018921, + "learning_rate": 5.383637134378975e-07, + "loss": 0.4614, + "mean_token_accuracy": 0.8490458130836487, + "num_tokens": 48466913.0, + "step": 1271 + }, + { + "epoch": 0.16181147436712887, + "ewc_loss": 2.5033950805664062e-06, + "grad_norm": 1.833059310913086, + "learning_rate": 5.387876218736752e-07, + "loss": 0.5417, + "mean_token_accuracy": 0.8245713114738464, + "num_tokens": 48512658.0, + "step": 1272 + }, + { + "epoch": 0.16193868464571937, + "ewc_loss": 2.5033950805664062e-06, + "grad_norm": 1.6922568082809448, + "learning_rate": 5.392115303094531e-07, + "loss": 0.4382, + "mean_token_accuracy": 0.8563932180404663, + "num_tokens": 48552283.0, + "step": 1273 + }, + { + "epoch": 0.1620658949243099, + "ewc_loss": 2.5033950805664062e-06, + "grad_norm": 1.7741590738296509, + "learning_rate": 5.39635438745231e-07, + "loss": 0.4183, + "mean_token_accuracy": 0.863897442817688, + "num_tokens": 48590504.0, + "step": 1274 + }, + { + "epoch": 0.1621931052029004, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 1.9457688331604004, + "learning_rate": 5.400593471810089e-07, + "loss": 0.5072, + "mean_token_accuracy": 0.8368320465087891, + "num_tokens": 48629993.0, + "step": 1275 + }, + { + "epoch": 0.1623203154814909, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 2.015479564666748, + "learning_rate": 5.404832556167867e-07, + "loss": 0.5085, + "mean_token_accuracy": 0.8378725051879883, + "num_tokens": 48665074.0, + "step": 1276 + }, + { + "epoch": 0.16244752576008142, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 1.8353426456451416, + "learning_rate": 5.409071640525647e-07, + "loss": 0.4689, + "mean_token_accuracy": 0.851847767829895, + "num_tokens": 48705524.0, + "step": 1277 + }, + { + "epoch": 0.16257473603867192, + "ewc_loss": 2.4884939193725586e-06, + "grad_norm": 1.8513716459274292, + "learning_rate": 5.413310724883425e-07, + "loss": 0.5165, + "mean_token_accuracy": 0.8281844854354858, + "num_tokens": 48744030.0, + "step": 1278 + }, + { + "epoch": 0.16270194631726242, + "ewc_loss": 2.4884939193725586e-06, + "grad_norm": 1.7769922018051147, + "learning_rate": 5.417549809241205e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8421943187713623, + "num_tokens": 48788401.0, + "step": 1279 + }, + { + "epoch": 0.16282915659585295, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 1.9639750719070435, + "learning_rate": 5.421788893598982e-07, + "loss": 0.5097, + "mean_token_accuracy": 0.8374066948890686, + "num_tokens": 48823046.0, + "step": 1280 + }, + { + "epoch": 0.16295636687444345, + "ewc_loss": 2.5480985641479492e-06, + "grad_norm": 1.6967805624008179, + "learning_rate": 5.42602797795676e-07, + "loss": 0.5166, + "mean_token_accuracy": 0.834228515625, + "num_tokens": 48868293.0, + "step": 1281 + }, + { + "epoch": 0.16308357715303395, + "ewc_loss": 2.4884939193725586e-06, + "grad_norm": 1.8416239023208618, + "learning_rate": 5.43026706231454e-07, + "loss": 0.5432, + "mean_token_accuracy": 0.8256669640541077, + "num_tokens": 48908775.0, + "step": 1282 + }, + { + "epoch": 0.16321078743162448, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 1.8371260166168213, + "learning_rate": 5.434506146672319e-07, + "loss": 0.5218, + "mean_token_accuracy": 0.8329086303710938, + "num_tokens": 48957640.0, + "step": 1283 + }, + { + "epoch": 0.16333799771021498, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 1.8455603122711182, + "learning_rate": 5.438745231030097e-07, + "loss": 0.5135, + "mean_token_accuracy": 0.8331407308578491, + "num_tokens": 48993601.0, + "step": 1284 + }, + { + "epoch": 0.16346520798880548, + "ewc_loss": 2.5480985641479492e-06, + "grad_norm": 1.8463908433914185, + "learning_rate": 5.442984315387876e-07, + "loss": 0.4174, + "mean_token_accuracy": 0.8631255030632019, + "num_tokens": 49028411.0, + "step": 1285 + }, + { + "epoch": 0.163592418267396, + "ewc_loss": 2.5480985641479492e-06, + "grad_norm": 1.6765544414520264, + "learning_rate": 5.447223399745655e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8492662906646729, + "num_tokens": 49067449.0, + "step": 1286 + }, + { + "epoch": 0.1637196285459865, + "ewc_loss": 2.5480985641479492e-06, + "grad_norm": 1.9085454940795898, + "learning_rate": 5.451462484103433e-07, + "loss": 0.4578, + "mean_token_accuracy": 0.8534442782402039, + "num_tokens": 49102650.0, + "step": 1287 + }, + { + "epoch": 0.163846838824577, + "ewc_loss": 2.5480985641479492e-06, + "grad_norm": 1.8535741567611694, + "learning_rate": 5.455701568461212e-07, + "loss": 0.4796, + "mean_token_accuracy": 0.845105767250061, + "num_tokens": 49137110.0, + "step": 1288 + }, + { + "epoch": 0.16397404910316754, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.8439160585403442, + "learning_rate": 5.45994065281899e-07, + "loss": 0.5519, + "mean_token_accuracy": 0.8216842412948608, + "num_tokens": 49177765.0, + "step": 1289 + }, + { + "epoch": 0.16410125938175804, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.965736985206604, + "learning_rate": 5.46417973717677e-07, + "loss": 0.4925, + "mean_token_accuracy": 0.8396299481391907, + "num_tokens": 49207860.0, + "step": 1290 + }, + { + "epoch": 0.16422846966034857, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 2.0245201587677, + "learning_rate": 5.468418821534548e-07, + "loss": 0.4278, + "mean_token_accuracy": 0.8610696792602539, + "num_tokens": 49239994.0, + "step": 1291 + }, + { + "epoch": 0.16435567993893907, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.8773537874221802, + "learning_rate": 5.472657905892327e-07, + "loss": 0.4417, + "mean_token_accuracy": 0.8579833507537842, + "num_tokens": 49273464.0, + "step": 1292 + }, + { + "epoch": 0.16448289021752957, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.8739935159683228, + "learning_rate": 5.476896990250106e-07, + "loss": 0.5539, + "mean_token_accuracy": 0.8240504264831543, + "num_tokens": 49313144.0, + "step": 1293 + }, + { + "epoch": 0.1646101004961201, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.7052232027053833, + "learning_rate": 5.481136074607885e-07, + "loss": 0.4405, + "mean_token_accuracy": 0.8537378907203674, + "num_tokens": 49351392.0, + "step": 1294 + }, + { + "epoch": 0.1647373107747106, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.617052674293518, + "learning_rate": 5.485375158965663e-07, + "loss": 0.4447, + "mean_token_accuracy": 0.8565340042114258, + "num_tokens": 49396726.0, + "step": 1295 + }, + { + "epoch": 0.1648645210533011, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.728806495666504, + "learning_rate": 5.489614243323442e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8453445434570312, + "num_tokens": 49435856.0, + "step": 1296 + }, + { + "epoch": 0.16499173133189163, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.647409200668335, + "learning_rate": 5.49385332768122e-07, + "loss": 0.4196, + "mean_token_accuracy": 0.8616434335708618, + "num_tokens": 49473695.0, + "step": 1297 + }, + { + "epoch": 0.16511894161048213, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.9186419248580933, + "learning_rate": 5.498092412039e-07, + "loss": 0.5432, + "mean_token_accuracy": 0.8316320776939392, + "num_tokens": 49509165.0, + "step": 1298 + }, + { + "epoch": 0.16524615188907263, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.5882105827331543, + "learning_rate": 5.502331496396778e-07, + "loss": 0.4475, + "mean_token_accuracy": 0.8542036414146423, + "num_tokens": 49553790.0, + "step": 1299 + }, + { + "epoch": 0.16537336216766316, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.7696946859359741, + "learning_rate": 5.506570580754557e-07, + "loss": 0.4721, + "mean_token_accuracy": 0.847856879234314, + "num_tokens": 49593124.0, + "step": 1300 + }, + { + "epoch": 0.16550057244625366, + "ewc_loss": 2.5779008865356445e-06, + "grad_norm": 1.8418354988098145, + "learning_rate": 5.510809665112336e-07, + "loss": 0.4576, + "mean_token_accuracy": 0.854796826839447, + "num_tokens": 49632016.0, + "step": 1301 + }, + { + "epoch": 0.16562778272484416, + "ewc_loss": 2.5779008865356445e-06, + "grad_norm": 1.734005331993103, + "learning_rate": 5.515048749470113e-07, + "loss": 0.4754, + "mean_token_accuracy": 0.8485583066940308, + "num_tokens": 49670689.0, + "step": 1302 + }, + { + "epoch": 0.1657549930034347, + "ewc_loss": 2.5779008865356445e-06, + "grad_norm": 1.8761935234069824, + "learning_rate": 5.519287833827893e-07, + "loss": 0.4603, + "mean_token_accuracy": 0.8505444526672363, + "num_tokens": 49709230.0, + "step": 1303 + }, + { + "epoch": 0.1658822032820252, + "ewc_loss": 2.5779008865356445e-06, + "grad_norm": 2.137111186981201, + "learning_rate": 5.523526918185671e-07, + "loss": 0.5143, + "mean_token_accuracy": 0.8347648978233337, + "num_tokens": 49741648.0, + "step": 1304 + }, + { + "epoch": 0.1660094135606157, + "ewc_loss": 2.592802047729492e-06, + "grad_norm": 1.9105558395385742, + "learning_rate": 5.52776600254345e-07, + "loss": 0.4348, + "mean_token_accuracy": 0.8549038767814636, + "num_tokens": 49773906.0, + "step": 1305 + }, + { + "epoch": 0.16613662383920622, + "ewc_loss": 2.592802047729492e-06, + "grad_norm": 1.8019227981567383, + "learning_rate": 5.532005086901229e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.8457106351852417, + "num_tokens": 49812645.0, + "step": 1306 + }, + { + "epoch": 0.16626383411779672, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 1.6387290954589844, + "learning_rate": 5.536244171259008e-07, + "loss": 0.4636, + "mean_token_accuracy": 0.8521769046783447, + "num_tokens": 49854500.0, + "step": 1307 + }, + { + "epoch": 0.16639104439638722, + "ewc_loss": 2.592802047729492e-06, + "grad_norm": 1.9802005290985107, + "learning_rate": 5.540483255616786e-07, + "loss": 0.5566, + "mean_token_accuracy": 0.8248774409294128, + "num_tokens": 49889408.0, + "step": 1308 + }, + { + "epoch": 0.16651825467497774, + "ewc_loss": 2.592802047729492e-06, + "grad_norm": 1.9345866441726685, + "learning_rate": 5.544722339974566e-07, + "loss": 0.4778, + "mean_token_accuracy": 0.8474600315093994, + "num_tokens": 49923581.0, + "step": 1309 + }, + { + "epoch": 0.16664546495356825, + "ewc_loss": 2.592802047729492e-06, + "grad_norm": 1.7823941707611084, + "learning_rate": 5.548961424332343e-07, + "loss": 0.4668, + "mean_token_accuracy": 0.8475217223167419, + "num_tokens": 49959433.0, + "step": 1310 + }, + { + "epoch": 0.16677267523215875, + "ewc_loss": 2.592802047729492e-06, + "grad_norm": 1.934800624847412, + "learning_rate": 5.553200508690123e-07, + "loss": 0.4351, + "mean_token_accuracy": 0.8607295751571655, + "num_tokens": 50000307.0, + "step": 1311 + }, + { + "epoch": 0.16689988551074927, + "ewc_loss": 2.5779008865356445e-06, + "grad_norm": 1.8288649320602417, + "learning_rate": 5.557439593047901e-07, + "loss": 0.528, + "mean_token_accuracy": 0.8297667503356934, + "num_tokens": 50035587.0, + "step": 1312 + }, + { + "epoch": 0.16702709578933977, + "ewc_loss": 2.5779008865356445e-06, + "grad_norm": 1.6659003496170044, + "learning_rate": 5.56167867740568e-07, + "loss": 0.468, + "mean_token_accuracy": 0.8491032123565674, + "num_tokens": 50079425.0, + "step": 1313 + }, + { + "epoch": 0.16715430606793028, + "ewc_loss": 2.592802047729492e-06, + "grad_norm": 1.8579092025756836, + "learning_rate": 5.565917761763459e-07, + "loss": 0.4524, + "mean_token_accuracy": 0.8551030158996582, + "num_tokens": 50110674.0, + "step": 1314 + }, + { + "epoch": 0.1672815163465208, + "ewc_loss": 2.592802047729492e-06, + "grad_norm": 1.772100567817688, + "learning_rate": 5.570156846121238e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.8432843685150146, + "num_tokens": 50149033.0, + "step": 1315 + }, + { + "epoch": 0.1674087266251113, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 1.7764421701431274, + "learning_rate": 5.574395930479016e-07, + "loss": 0.5105, + "mean_token_accuracy": 0.8380846977233887, + "num_tokens": 50185728.0, + "step": 1316 + }, + { + "epoch": 0.16753593690370183, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 1.856794834136963, + "learning_rate": 5.578635014836796e-07, + "loss": 0.4607, + "mean_token_accuracy": 0.8521615266799927, + "num_tokens": 50219671.0, + "step": 1317 + }, + { + "epoch": 0.16766314718229233, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 2.035811185836792, + "learning_rate": 5.582874099194573e-07, + "loss": 0.5167, + "mean_token_accuracy": 0.8373550176620483, + "num_tokens": 50251989.0, + "step": 1318 + }, + { + "epoch": 0.16779035746088283, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 1.8577004671096802, + "learning_rate": 5.587113183552353e-07, + "loss": 0.5037, + "mean_token_accuracy": 0.8391120433807373, + "num_tokens": 50291079.0, + "step": 1319 + }, + { + "epoch": 0.16791756773947336, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 1.7221099138259888, + "learning_rate": 5.591352267910131e-07, + "loss": 0.4617, + "mean_token_accuracy": 0.8508173227310181, + "num_tokens": 50332262.0, + "step": 1320 + }, + { + "epoch": 0.16804477801806386, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 2.0881457328796387, + "learning_rate": 5.59559135226791e-07, + "loss": 0.4915, + "mean_token_accuracy": 0.8454287052154541, + "num_tokens": 50371278.0, + "step": 1321 + }, + { + "epoch": 0.16817198829665436, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 1.9408022165298462, + "learning_rate": 5.599830436625689e-07, + "loss": 0.4669, + "mean_token_accuracy": 0.8512154221534729, + "num_tokens": 50409463.0, + "step": 1322 + }, + { + "epoch": 0.1682991985752449, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 1.7901636362075806, + "learning_rate": 5.604069520983468e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.8529828190803528, + "num_tokens": 50446745.0, + "step": 1323 + }, + { + "epoch": 0.1684264088538354, + "ewc_loss": 2.6226043701171875e-06, + "grad_norm": 1.8532649278640747, + "learning_rate": 5.608308605341246e-07, + "loss": 0.4547, + "mean_token_accuracy": 0.8515172600746155, + "num_tokens": 50479725.0, + "step": 1324 + }, + { + "epoch": 0.1685536191324259, + "ewc_loss": 2.637505531311035e-06, + "grad_norm": 2.1207735538482666, + "learning_rate": 5.612547689699024e-07, + "loss": 0.5162, + "mean_token_accuracy": 0.8353087902069092, + "num_tokens": 50514734.0, + "step": 1325 + }, + { + "epoch": 0.16868082941101642, + "ewc_loss": 2.637505531311035e-06, + "grad_norm": 1.7335084676742554, + "learning_rate": 5.616786774056803e-07, + "loss": 0.4944, + "mean_token_accuracy": 0.8438496589660645, + "num_tokens": 50555165.0, + "step": 1326 + }, + { + "epoch": 0.16880803968960692, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 1.817690372467041, + "learning_rate": 5.621025858414582e-07, + "loss": 0.4963, + "mean_token_accuracy": 0.8407958149909973, + "num_tokens": 50598251.0, + "step": 1327 + }, + { + "epoch": 0.16893524996819742, + "ewc_loss": 2.637505531311035e-06, + "grad_norm": 1.6615275144577026, + "learning_rate": 5.625264942772361e-07, + "loss": 0.4882, + "mean_token_accuracy": 0.8453807830810547, + "num_tokens": 50638423.0, + "step": 1328 + }, + { + "epoch": 0.16906246024678795, + "ewc_loss": 2.637505531311035e-06, + "grad_norm": 1.7083103656768799, + "learning_rate": 5.629504027130139e-07, + "loss": 0.5436, + "mean_token_accuracy": 0.835297703742981, + "num_tokens": 50678226.0, + "step": 1329 + }, + { + "epoch": 0.16918967052537845, + "ewc_loss": 2.637505531311035e-06, + "grad_norm": 1.6688276529312134, + "learning_rate": 5.633743111487919e-07, + "loss": 0.5449, + "mean_token_accuracy": 0.8247915506362915, + "num_tokens": 50717542.0, + "step": 1330 + }, + { + "epoch": 0.16931688080396895, + "ewc_loss": 2.637505531311035e-06, + "grad_norm": 1.8275858163833618, + "learning_rate": 5.637982195845697e-07, + "loss": 0.4274, + "mean_token_accuracy": 0.8610815405845642, + "num_tokens": 50752914.0, + "step": 1331 + }, + { + "epoch": 0.16944409108255948, + "ewc_loss": 2.637505531311035e-06, + "grad_norm": 1.7516525983810425, + "learning_rate": 5.642221280203476e-07, + "loss": 0.4266, + "mean_token_accuracy": 0.8643187284469604, + "num_tokens": 50789276.0, + "step": 1332 + }, + { + "epoch": 0.16957130136114998, + "ewc_loss": 2.652406692504883e-06, + "grad_norm": 2.0800907611846924, + "learning_rate": 5.646460364561254e-07, + "loss": 0.4456, + "mean_token_accuracy": 0.8557742238044739, + "num_tokens": 50820759.0, + "step": 1333 + }, + { + "epoch": 0.16969851163974048, + "ewc_loss": 2.652406692504883e-06, + "grad_norm": 1.8392789363861084, + "learning_rate": 5.650699448919033e-07, + "loss": 0.536, + "mean_token_accuracy": 0.8291994333267212, + "num_tokens": 50863241.0, + "step": 1334 + }, + { + "epoch": 0.169825721918331, + "ewc_loss": 2.652406692504883e-06, + "grad_norm": 1.9268933534622192, + "learning_rate": 5.654938533276812e-07, + "loss": 0.4745, + "mean_token_accuracy": 0.8464697003364563, + "num_tokens": 50904296.0, + "step": 1335 + }, + { + "epoch": 0.1699529321969215, + "ewc_loss": 2.652406692504883e-06, + "grad_norm": 1.8122638463974, + "learning_rate": 5.659177617634591e-07, + "loss": 0.464, + "mean_token_accuracy": 0.851891040802002, + "num_tokens": 50942180.0, + "step": 1336 + }, + { + "epoch": 0.170080142475512, + "ewc_loss": 2.652406692504883e-06, + "grad_norm": 1.7835240364074707, + "learning_rate": 5.663416701992369e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.844269871711731, + "num_tokens": 50980287.0, + "step": 1337 + }, + { + "epoch": 0.17020735275410254, + "ewc_loss": 2.652406692504883e-06, + "grad_norm": 1.8730236291885376, + "learning_rate": 5.667655786350149e-07, + "loss": 0.5473, + "mean_token_accuracy": 0.8199698328971863, + "num_tokens": 51014218.0, + "step": 1338 + }, + { + "epoch": 0.17033456303269304, + "ewc_loss": 2.652406692504883e-06, + "grad_norm": 1.6455084085464478, + "learning_rate": 5.671894870707927e-07, + "loss": 0.4684, + "mean_token_accuracy": 0.8526039719581604, + "num_tokens": 51055915.0, + "step": 1339 + }, + { + "epoch": 0.17046177331128357, + "ewc_loss": 2.682209014892578e-06, + "grad_norm": 1.6790618896484375, + "learning_rate": 5.676133955065705e-07, + "loss": 0.4143, + "mean_token_accuracy": 0.8648375868797302, + "num_tokens": 51090684.0, + "step": 1340 + }, + { + "epoch": 0.17058898358987407, + "ewc_loss": 2.682209014892578e-06, + "grad_norm": 1.740286111831665, + "learning_rate": 5.680373039423484e-07, + "loss": 0.4568, + "mean_token_accuracy": 0.8526129126548767, + "num_tokens": 51130156.0, + "step": 1341 + }, + { + "epoch": 0.17071619386846457, + "ewc_loss": 2.682209014892578e-06, + "grad_norm": 1.8021087646484375, + "learning_rate": 5.684612123781263e-07, + "loss": 0.4979, + "mean_token_accuracy": 0.8393697738647461, + "num_tokens": 51171282.0, + "step": 1342 + }, + { + "epoch": 0.1708434041470551, + "ewc_loss": 2.682209014892578e-06, + "grad_norm": 1.7880945205688477, + "learning_rate": 5.688851208139042e-07, + "loss": 0.5128, + "mean_token_accuracy": 0.8337036371231079, + "num_tokens": 51210616.0, + "step": 1343 + }, + { + "epoch": 0.1709706144256456, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 2.1911439895629883, + "learning_rate": 5.69309029249682e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8441581726074219, + "num_tokens": 51253357.0, + "step": 1344 + }, + { + "epoch": 0.1710978247042361, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.6814043521881104, + "learning_rate": 5.697329376854599e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8449721336364746, + "num_tokens": 51292671.0, + "step": 1345 + }, + { + "epoch": 0.17122503498282662, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.8280751705169678, + "learning_rate": 5.701568461212378e-07, + "loss": 0.4215, + "mean_token_accuracy": 0.8643792867660522, + "num_tokens": 51329145.0, + "step": 1346 + }, + { + "epoch": 0.17135224526141712, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.6168032884597778, + "learning_rate": 5.705807545570157e-07, + "loss": 0.4312, + "mean_token_accuracy": 0.8602039217948914, + "num_tokens": 51371265.0, + "step": 1347 + }, + { + "epoch": 0.17147945554000762, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.8513340950012207, + "learning_rate": 5.710046629927934e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.8486565351486206, + "num_tokens": 51410262.0, + "step": 1348 + }, + { + "epoch": 0.17160666581859815, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.6821917295455933, + "learning_rate": 5.714285714285714e-07, + "loss": 0.4379, + "mean_token_accuracy": 0.8588300943374634, + "num_tokens": 51447796.0, + "step": 1349 + }, + { + "epoch": 0.17173387609718865, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 2.2009589672088623, + "learning_rate": 5.718524798643492e-07, + "loss": 0.5012, + "mean_token_accuracy": 0.8409037590026855, + "num_tokens": 51482677.0, + "step": 1350 + }, + { + "epoch": 0.17186108637577915, + "ewc_loss": 2.682209014892578e-06, + "grad_norm": 1.8108867406845093, + "learning_rate": 5.722763883001272e-07, + "loss": 0.5031, + "mean_token_accuracy": 0.8412790298461914, + "num_tokens": 51524111.0, + "step": 1351 + }, + { + "epoch": 0.17198829665436968, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.7778196334838867, + "learning_rate": 5.72700296735905e-07, + "loss": 0.4744, + "mean_token_accuracy": 0.8463330268859863, + "num_tokens": 51561008.0, + "step": 1352 + }, + { + "epoch": 0.17211550693296018, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.8496437072753906, + "learning_rate": 5.731242051716829e-07, + "loss": 0.53, + "mean_token_accuracy": 0.8286532759666443, + "num_tokens": 51603091.0, + "step": 1353 + }, + { + "epoch": 0.17224271721155068, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.8233813047409058, + "learning_rate": 5.735481136074608e-07, + "loss": 0.4449, + "mean_token_accuracy": 0.8540576100349426, + "num_tokens": 51637789.0, + "step": 1354 + }, + { + "epoch": 0.1723699274901412, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.8655740022659302, + "learning_rate": 5.739720220432386e-07, + "loss": 0.4494, + "mean_token_accuracy": 0.8516846895217896, + "num_tokens": 51674207.0, + "step": 1355 + }, + { + "epoch": 0.1724971377687317, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.7352057695388794, + "learning_rate": 5.743959304790164e-07, + "loss": 0.5435, + "mean_token_accuracy": 0.8295516967773438, + "num_tokens": 51715109.0, + "step": 1356 + }, + { + "epoch": 0.1726243480473222, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.8122611045837402, + "learning_rate": 5.748198389147944e-07, + "loss": 0.492, + "mean_token_accuracy": 0.8443707227706909, + "num_tokens": 51753373.0, + "step": 1357 + }, + { + "epoch": 0.17275155832591274, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.7378296852111816, + "learning_rate": 5.752437473505722e-07, + "loss": 0.5271, + "mean_token_accuracy": 0.835789680480957, + "num_tokens": 51795100.0, + "step": 1358 + }, + { + "epoch": 0.17287876860450324, + "ewc_loss": 2.7120113372802734e-06, + "grad_norm": 1.9135314226150513, + "learning_rate": 5.756676557863502e-07, + "loss": 0.5016, + "mean_token_accuracy": 0.8384497165679932, + "num_tokens": 51831840.0, + "step": 1359 + }, + { + "epoch": 0.17300597888309374, + "ewc_loss": 2.7120113372802734e-06, + "grad_norm": 1.8159767389297485, + "learning_rate": 5.76091564222128e-07, + "loss": 0.455, + "mean_token_accuracy": 0.8549122214317322, + "num_tokens": 51870227.0, + "step": 1360 + }, + { + "epoch": 0.17313318916168427, + "ewc_loss": 2.7120113372802734e-06, + "grad_norm": 1.823794960975647, + "learning_rate": 5.765154726579059e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.8475700616836548, + "num_tokens": 51908251.0, + "step": 1361 + }, + { + "epoch": 0.17326039944027477, + "ewc_loss": 2.7120113372802734e-06, + "grad_norm": 1.8980566263198853, + "learning_rate": 5.769393810936838e-07, + "loss": 0.471, + "mean_token_accuracy": 0.8488314151763916, + "num_tokens": 51944531.0, + "step": 1362 + }, + { + "epoch": 0.17338760971886527, + "ewc_loss": 2.7120113372802734e-06, + "grad_norm": 1.8041245937347412, + "learning_rate": 5.773632895294616e-07, + "loss": 0.5215, + "mean_token_accuracy": 0.8325759768486023, + "num_tokens": 51985742.0, + "step": 1363 + }, + { + "epoch": 0.1735148199974558, + "ewc_loss": 2.7120113372802734e-06, + "grad_norm": 1.8756662607192993, + "learning_rate": 5.777871979652394e-07, + "loss": 0.4612, + "mean_token_accuracy": 0.8518168926239014, + "num_tokens": 52022036.0, + "step": 1364 + }, + { + "epoch": 0.1736420302760463, + "ewc_loss": 2.7120113372802734e-06, + "grad_norm": 1.6427260637283325, + "learning_rate": 5.782111064010173e-07, + "loss": 0.4856, + "mean_token_accuracy": 0.843558669090271, + "num_tokens": 52066077.0, + "step": 1365 + }, + { + "epoch": 0.17376924055463683, + "ewc_loss": 2.7120113372802734e-06, + "grad_norm": 1.6121870279312134, + "learning_rate": 5.786350148367952e-07, + "loss": 0.4766, + "mean_token_accuracy": 0.8480254411697388, + "num_tokens": 52111280.0, + "step": 1366 + }, + { + "epoch": 0.17389645083322733, + "ewc_loss": 2.7418136596679688e-06, + "grad_norm": 1.703250527381897, + "learning_rate": 5.790589232725731e-07, + "loss": 0.4748, + "mean_token_accuracy": 0.8463023900985718, + "num_tokens": 52153148.0, + "step": 1367 + }, + { + "epoch": 0.17402366111181783, + "ewc_loss": 2.7120113372802734e-06, + "grad_norm": 1.8685263395309448, + "learning_rate": 5.79482831708351e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.8517332673072815, + "num_tokens": 52186722.0, + "step": 1368 + }, + { + "epoch": 0.17415087139040836, + "ewc_loss": 2.7120113372802734e-06, + "grad_norm": 2.023048162460327, + "learning_rate": 5.799067401441288e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8473957777023315, + "num_tokens": 52220837.0, + "step": 1369 + }, + { + "epoch": 0.17427808166899886, + "ewc_loss": 2.7418136596679688e-06, + "grad_norm": 1.8529322147369385, + "learning_rate": 5.803306485799068e-07, + "loss": 0.4546, + "mean_token_accuracy": 0.8549777269363403, + "num_tokens": 52257049.0, + "step": 1370 + }, + { + "epoch": 0.17440529194758936, + "ewc_loss": 2.726912498474121e-06, + "grad_norm": 1.708536982536316, + "learning_rate": 5.807545570156845e-07, + "loss": 0.433, + "mean_token_accuracy": 0.859043300151825, + "num_tokens": 52297632.0, + "step": 1371 + }, + { + "epoch": 0.1745325022261799, + "ewc_loss": 2.7567148208618164e-06, + "grad_norm": 1.965267300605774, + "learning_rate": 5.811784654514624e-07, + "loss": 0.5273, + "mean_token_accuracy": 0.831832766532898, + "num_tokens": 52331008.0, + "step": 1372 + }, + { + "epoch": 0.1746597125047704, + "ewc_loss": 2.7567148208618164e-06, + "grad_norm": 1.735725998878479, + "learning_rate": 5.816023738872403e-07, + "loss": 0.4592, + "mean_token_accuracy": 0.8528362512588501, + "num_tokens": 52369420.0, + "step": 1373 + }, + { + "epoch": 0.1747869227833609, + "ewc_loss": 2.7418136596679688e-06, + "grad_norm": 1.8266148567199707, + "learning_rate": 5.820262823230182e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.842530369758606, + "num_tokens": 52411759.0, + "step": 1374 + }, + { + "epoch": 0.17491413306195142, + "ewc_loss": 2.7567148208618164e-06, + "grad_norm": 1.9533008337020874, + "learning_rate": 5.824501907587961e-07, + "loss": 0.451, + "mean_token_accuracy": 0.8541964888572693, + "num_tokens": 52446459.0, + "step": 1375 + }, + { + "epoch": 0.17504134334054192, + "ewc_loss": 2.7567148208618164e-06, + "grad_norm": 1.765584945678711, + "learning_rate": 5.82874099194574e-07, + "loss": 0.5352, + "mean_token_accuracy": 0.828734815120697, + "num_tokens": 52484615.0, + "step": 1376 + }, + { + "epoch": 0.17516855361913242, + "ewc_loss": 2.7567148208618164e-06, + "grad_norm": 1.7234243154525757, + "learning_rate": 5.832980076303518e-07, + "loss": 0.544, + "mean_token_accuracy": 0.8258804082870483, + "num_tokens": 52525195.0, + "step": 1377 + }, + { + "epoch": 0.17529576389772294, + "ewc_loss": 2.771615982055664e-06, + "grad_norm": 1.885110855102539, + "learning_rate": 5.837219160661297e-07, + "loss": 0.4322, + "mean_token_accuracy": 0.858909010887146, + "num_tokens": 52558783.0, + "step": 1378 + }, + { + "epoch": 0.17542297417631345, + "ewc_loss": 2.771615982055664e-06, + "grad_norm": 1.7247868776321411, + "learning_rate": 5.841458245019075e-07, + "loss": 0.4257, + "mean_token_accuracy": 0.861274003982544, + "num_tokens": 52599176.0, + "step": 1379 + }, + { + "epoch": 0.17555018445490395, + "ewc_loss": 2.7865171432495117e-06, + "grad_norm": 1.9882714748382568, + "learning_rate": 5.845697329376855e-07, + "loss": 0.4816, + "mean_token_accuracy": 0.848886251449585, + "num_tokens": 52630865.0, + "step": 1380 + }, + { + "epoch": 0.17567739473349447, + "ewc_loss": 2.7865171432495117e-06, + "grad_norm": 2.8247506618499756, + "learning_rate": 5.849936413734633e-07, + "loss": 0.5335, + "mean_token_accuracy": 0.8302857875823975, + "num_tokens": 52671702.0, + "step": 1381 + }, + { + "epoch": 0.17580460501208497, + "ewc_loss": 2.7865171432495117e-06, + "grad_norm": 1.8222246170043945, + "learning_rate": 5.854175498092412e-07, + "loss": 0.4389, + "mean_token_accuracy": 0.8590050339698792, + "num_tokens": 52712288.0, + "step": 1382 + }, + { + "epoch": 0.17593181529067548, + "ewc_loss": 2.7865171432495117e-06, + "grad_norm": 1.6350984573364258, + "learning_rate": 5.858414582450191e-07, + "loss": 0.4807, + "mean_token_accuracy": 0.8469899892807007, + "num_tokens": 52756569.0, + "step": 1383 + }, + { + "epoch": 0.176059025569266, + "ewc_loss": 2.7865171432495117e-06, + "grad_norm": 1.7238974571228027, + "learning_rate": 5.86265366680797e-07, + "loss": 0.5035, + "mean_token_accuracy": 0.8402450680732727, + "num_tokens": 52796195.0, + "step": 1384 + }, + { + "epoch": 0.1761862358478565, + "ewc_loss": 2.7865171432495117e-06, + "grad_norm": 1.6760319471359253, + "learning_rate": 5.866892751165748e-07, + "loss": 0.4735, + "mean_token_accuracy": 0.8491325378417969, + "num_tokens": 52839687.0, + "step": 1385 + }, + { + "epoch": 0.176313446126447, + "ewc_loss": 2.7865171432495117e-06, + "grad_norm": 1.8871312141418457, + "learning_rate": 5.871131835523526e-07, + "loss": 0.5288, + "mean_token_accuracy": 0.8315557241439819, + "num_tokens": 52875660.0, + "step": 1386 + }, + { + "epoch": 0.17644065640503753, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.6484228372573853, + "learning_rate": 5.875370919881305e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.8440538048744202, + "num_tokens": 52921220.0, + "step": 1387 + }, + { + "epoch": 0.17656786668362803, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.8210368156433105, + "learning_rate": 5.879610004239084e-07, + "loss": 0.5374, + "mean_token_accuracy": 0.8323879837989807, + "num_tokens": 52960042.0, + "step": 1388 + }, + { + "epoch": 0.17669507696221853, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.8418245315551758, + "learning_rate": 5.883849088596863e-07, + "loss": 0.5217, + "mean_token_accuracy": 0.8355681896209717, + "num_tokens": 53002817.0, + "step": 1389 + }, + { + "epoch": 0.17682228724080906, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.846777081489563, + "learning_rate": 5.888088172954641e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.8450039625167847, + "num_tokens": 53037751.0, + "step": 1390 + }, + { + "epoch": 0.17694949751939956, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.8217155933380127, + "learning_rate": 5.892327257312421e-07, + "loss": 0.4119, + "mean_token_accuracy": 0.8679434061050415, + "num_tokens": 53074286.0, + "step": 1391 + }, + { + "epoch": 0.1770767077979901, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.757007360458374, + "learning_rate": 5.896566341670199e-07, + "loss": 0.4315, + "mean_token_accuracy": 0.8565359115600586, + "num_tokens": 53113525.0, + "step": 1392 + }, + { + "epoch": 0.1772039180765806, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.9652462005615234, + "learning_rate": 5.900805426027977e-07, + "loss": 0.4753, + "mean_token_accuracy": 0.8472298383712769, + "num_tokens": 53150137.0, + "step": 1393 + }, + { + "epoch": 0.1773311283551711, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.7647814750671387, + "learning_rate": 5.905044510385756e-07, + "loss": 0.4292, + "mean_token_accuracy": 0.8604317307472229, + "num_tokens": 53189519.0, + "step": 1394 + }, + { + "epoch": 0.17745833863376162, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.7704781293869019, + "learning_rate": 5.909283594743535e-07, + "loss": 0.5365, + "mean_token_accuracy": 0.8292432427406311, + "num_tokens": 53227452.0, + "step": 1395 + }, + { + "epoch": 0.17758554891235212, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.7101178169250488, + "learning_rate": 5.913522679101314e-07, + "loss": 0.4689, + "mean_token_accuracy": 0.8463853597640991, + "num_tokens": 53265875.0, + "step": 1396 + }, + { + "epoch": 0.17771275919094262, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.7475796937942505, + "learning_rate": 5.917761763459093e-07, + "loss": 0.5285, + "mean_token_accuracy": 0.8312379717826843, + "num_tokens": 53305477.0, + "step": 1397 + }, + { + "epoch": 0.17783996946953315, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.6716939210891724, + "learning_rate": 5.922000847816871e-07, + "loss": 0.4492, + "mean_token_accuracy": 0.8563423156738281, + "num_tokens": 53342800.0, + "step": 1398 + }, + { + "epoch": 0.17796717974812365, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.7736650705337524, + "learning_rate": 5.926239932174651e-07, + "loss": 0.5143, + "mean_token_accuracy": 0.8367466926574707, + "num_tokens": 53387476.0, + "step": 1399 + }, + { + "epoch": 0.17809439002671415, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.772553563117981, + "learning_rate": 5.930479016532429e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.8484395742416382, + "num_tokens": 53424628.0, + "step": 1400 + }, + { + "epoch": 0.17822160030530468, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.7522403001785278, + "learning_rate": 5.934718100890207e-07, + "loss": 0.4528, + "mean_token_accuracy": 0.8511541485786438, + "num_tokens": 53464459.0, + "step": 1401 + }, + { + "epoch": 0.17834881058389518, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.8679627180099487, + "learning_rate": 5.938957185247986e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8362318873405457, + "num_tokens": 53501721.0, + "step": 1402 + }, + { + "epoch": 0.17847602086248568, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.9278903007507324, + "learning_rate": 5.943196269605765e-07, + "loss": 0.4624, + "mean_token_accuracy": 0.8481459617614746, + "num_tokens": 53536607.0, + "step": 1403 + }, + { + "epoch": 0.1786032311410762, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 2.222546100616455, + "learning_rate": 5.947435353963544e-07, + "loss": 0.5123, + "mean_token_accuracy": 0.8411362767219543, + "num_tokens": 53569809.0, + "step": 1404 + }, + { + "epoch": 0.1787304414196667, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 1.9091140031814575, + "learning_rate": 5.951674438321323e-07, + "loss": 0.4896, + "mean_token_accuracy": 0.8449848890304565, + "num_tokens": 53604423.0, + "step": 1405 + }, + { + "epoch": 0.1788576516982572, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 1.8680616617202759, + "learning_rate": 5.955913522679101e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.8415789604187012, + "num_tokens": 53642145.0, + "step": 1406 + }, + { + "epoch": 0.17898486197684774, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 1.7886080741882324, + "learning_rate": 5.96015260703688e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.821000874042511, + "num_tokens": 53682998.0, + "step": 1407 + }, + { + "epoch": 0.17911207225543824, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 2.1114723682403564, + "learning_rate": 5.964391691394659e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.8455042839050293, + "num_tokens": 53729764.0, + "step": 1408 + }, + { + "epoch": 0.17923928253402874, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.8981044292449951, + "learning_rate": 5.968630775752436e-07, + "loss": 0.5248, + "mean_token_accuracy": 0.8326504230499268, + "num_tokens": 53764522.0, + "step": 1409 + }, + { + "epoch": 0.17936649281261927, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 2.0138516426086426, + "learning_rate": 5.972869860110216e-07, + "loss": 0.5084, + "mean_token_accuracy": 0.8344353437423706, + "num_tokens": 53799791.0, + "step": 1410 + }, + { + "epoch": 0.17949370309120977, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 1.8212637901306152, + "learning_rate": 5.977108944467994e-07, + "loss": 0.546, + "mean_token_accuracy": 0.830251932144165, + "num_tokens": 53840181.0, + "step": 1411 + }, + { + "epoch": 0.17962091336980027, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 1.6826436519622803, + "learning_rate": 5.981348028825774e-07, + "loss": 0.4542, + "mean_token_accuracy": 0.857054591178894, + "num_tokens": 53882813.0, + "step": 1412 + }, + { + "epoch": 0.1797481236483908, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 1.7656329870224, + "learning_rate": 5.985587113183552e-07, + "loss": 0.501, + "mean_token_accuracy": 0.8411363363265991, + "num_tokens": 53922198.0, + "step": 1413 + }, + { + "epoch": 0.1798753339269813, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 1.8044503927230835, + "learning_rate": 5.989826197541331e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.8411651253700256, + "num_tokens": 53959390.0, + "step": 1414 + }, + { + "epoch": 0.18000254420557182, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 1.7783688306808472, + "learning_rate": 5.99406528189911e-07, + "loss": 0.5222, + "mean_token_accuracy": 0.8382962942123413, + "num_tokens": 54005707.0, + "step": 1415 + }, + { + "epoch": 0.18012975448416232, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 1.6786940097808838, + "learning_rate": 5.998304366256888e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.843585729598999, + "num_tokens": 54047808.0, + "step": 1416 + }, + { + "epoch": 0.18025696476275282, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 1.8076890707015991, + "learning_rate": 6.002543450614666e-07, + "loss": 0.4132, + "mean_token_accuracy": 0.8693097829818726, + "num_tokens": 54082929.0, + "step": 1417 + }, + { + "epoch": 0.18038417504134335, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 1.8841625452041626, + "learning_rate": 6.006782534972446e-07, + "loss": 0.4745, + "mean_token_accuracy": 0.8484440445899963, + "num_tokens": 54120199.0, + "step": 1418 + }, + { + "epoch": 0.18051138531993385, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.9885525703430176, + "learning_rate": 6.011021619330224e-07, + "loss": 0.5094, + "mean_token_accuracy": 0.8368932604789734, + "num_tokens": 54158629.0, + "step": 1419 + }, + { + "epoch": 0.18063859559852435, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.9665199518203735, + "learning_rate": 6.015260703688004e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.842444658279419, + "num_tokens": 54194772.0, + "step": 1420 + }, + { + "epoch": 0.18076580587711488, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 2.041457176208496, + "learning_rate": 6.019499788045782e-07, + "loss": 0.4967, + "mean_token_accuracy": 0.8429962396621704, + "num_tokens": 54229825.0, + "step": 1421 + }, + { + "epoch": 0.18089301615570538, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 2.090761184692383, + "learning_rate": 6.023738872403561e-07, + "loss": 0.4526, + "mean_token_accuracy": 0.8528155088424683, + "num_tokens": 54260051.0, + "step": 1422 + }, + { + "epoch": 0.18102022643429588, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.852020263671875, + "learning_rate": 6.02797795676134e-07, + "loss": 0.479, + "mean_token_accuracy": 0.8496158123016357, + "num_tokens": 54297865.0, + "step": 1423 + }, + { + "epoch": 0.1811474367128864, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.9357762336730957, + "learning_rate": 6.032217041119118e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.8367676734924316, + "num_tokens": 54334618.0, + "step": 1424 + }, + { + "epoch": 0.1812746469914769, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.923691749572754, + "learning_rate": 6.036456125476896e-07, + "loss": 0.5299, + "mean_token_accuracy": 0.8301742076873779, + "num_tokens": 54369761.0, + "step": 1425 + }, + { + "epoch": 0.1814018572700674, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.6842120885849, + "learning_rate": 6.040695209834675e-07, + "loss": 0.4303, + "mean_token_accuracy": 0.85804283618927, + "num_tokens": 54411549.0, + "step": 1426 + }, + { + "epoch": 0.18152906754865794, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.7816669940948486, + "learning_rate": 6.044934294192454e-07, + "loss": 0.5167, + "mean_token_accuracy": 0.8364987373352051, + "num_tokens": 54449137.0, + "step": 1427 + }, + { + "epoch": 0.18165627782724844, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.744051456451416, + "learning_rate": 6.049173378550233e-07, + "loss": 0.4711, + "mean_token_accuracy": 0.8509101867675781, + "num_tokens": 54494604.0, + "step": 1428 + }, + { + "epoch": 0.18178348810583894, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.9742447137832642, + "learning_rate": 6.053412462908012e-07, + "loss": 0.452, + "mean_token_accuracy": 0.8526036143302917, + "num_tokens": 54533533.0, + "step": 1429 + }, + { + "epoch": 0.18191069838442947, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.7537025213241577, + "learning_rate": 6.05765154726579e-07, + "loss": 0.523, + "mean_token_accuracy": 0.8342210054397583, + "num_tokens": 54575211.0, + "step": 1430 + }, + { + "epoch": 0.18203790866301997, + "ewc_loss": 2.86102294921875e-06, + "grad_norm": 1.6662633419036865, + "learning_rate": 6.061890631623569e-07, + "loss": 0.4311, + "mean_token_accuracy": 0.8602412939071655, + "num_tokens": 54618308.0, + "step": 1431 + }, + { + "epoch": 0.18216511894161047, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.6670254468917847, + "learning_rate": 6.066129715981347e-07, + "loss": 0.469, + "mean_token_accuracy": 0.848567008972168, + "num_tokens": 54660260.0, + "step": 1432 + }, + { + "epoch": 0.182292329220201, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.7935004234313965, + "learning_rate": 6.070368800339126e-07, + "loss": 0.4367, + "mean_token_accuracy": 0.8584344387054443, + "num_tokens": 54692056.0, + "step": 1433 + }, + { + "epoch": 0.1824195394987915, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.8572301864624023, + "learning_rate": 6.074607884696905e-07, + "loss": 0.461, + "mean_token_accuracy": 0.8499151468276978, + "num_tokens": 54729252.0, + "step": 1434 + }, + { + "epoch": 0.182546749777382, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.883238673210144, + "learning_rate": 6.078846969054684e-07, + "loss": 0.4826, + "mean_token_accuracy": 0.8436053991317749, + "num_tokens": 54763422.0, + "step": 1435 + }, + { + "epoch": 0.18267396005597253, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.6355199813842773, + "learning_rate": 6.083086053412463e-07, + "loss": 0.4616, + "mean_token_accuracy": 0.8504098653793335, + "num_tokens": 54803494.0, + "step": 1436 + }, + { + "epoch": 0.18280117033456303, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.8942097425460815, + "learning_rate": 6.087325137770242e-07, + "loss": 0.4478, + "mean_token_accuracy": 0.8538333177566528, + "num_tokens": 54839349.0, + "step": 1437 + }, + { + "epoch": 0.18292838061315353, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.8051234483718872, + "learning_rate": 6.09156422212802e-07, + "loss": 0.5212, + "mean_token_accuracy": 0.8382879495620728, + "num_tokens": 54881941.0, + "step": 1438 + }, + { + "epoch": 0.18305559089174406, + "ewc_loss": 2.86102294921875e-06, + "grad_norm": 1.8202751874923706, + "learning_rate": 6.095803306485799e-07, + "loss": 0.4733, + "mean_token_accuracy": 0.8448268175125122, + "num_tokens": 54922927.0, + "step": 1439 + }, + { + "epoch": 0.18318280117033456, + "ewc_loss": 2.86102294921875e-06, + "grad_norm": 1.8593372106552124, + "learning_rate": 6.100042390843577e-07, + "loss": 0.4982, + "mean_token_accuracy": 0.84023118019104, + "num_tokens": 54963885.0, + "step": 1440 + }, + { + "epoch": 0.1833100114489251, + "ewc_loss": 2.8908252716064453e-06, + "grad_norm": 1.8922613859176636, + "learning_rate": 6.104281475201356e-07, + "loss": 0.4517, + "mean_token_accuracy": 0.8529536724090576, + "num_tokens": 55001284.0, + "step": 1441 + }, + { + "epoch": 0.1834372217275156, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.9231070280075073, + "learning_rate": 6.108520559559135e-07, + "loss": 0.5095, + "mean_token_accuracy": 0.837833046913147, + "num_tokens": 55034813.0, + "step": 1442 + }, + { + "epoch": 0.1835644320061061, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.7645870447158813, + "learning_rate": 6.112759643916914e-07, + "loss": 0.4169, + "mean_token_accuracy": 0.8637853264808655, + "num_tokens": 55070620.0, + "step": 1443 + }, + { + "epoch": 0.18369164228469662, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.9680978059768677, + "learning_rate": 6.116998728274693e-07, + "loss": 0.4976, + "mean_token_accuracy": 0.8450661897659302, + "num_tokens": 55108394.0, + "step": 1444 + }, + { + "epoch": 0.18381885256328712, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.8344045877456665, + "learning_rate": 6.121237812632472e-07, + "loss": 0.4996, + "mean_token_accuracy": 0.838098406791687, + "num_tokens": 55148170.0, + "step": 1445 + }, + { + "epoch": 0.18394606284187762, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.7058719396591187, + "learning_rate": 6.125476896990249e-07, + "loss": 0.4444, + "mean_token_accuracy": 0.8590095043182373, + "num_tokens": 55188102.0, + "step": 1446 + }, + { + "epoch": 0.18407327312046814, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.7183892726898193, + "learning_rate": 6.129715981348028e-07, + "loss": 0.481, + "mean_token_accuracy": 0.8496086001396179, + "num_tokens": 55225893.0, + "step": 1447 + }, + { + "epoch": 0.18420048339905865, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.956484317779541, + "learning_rate": 6.133955065705807e-07, + "loss": 0.5024, + "mean_token_accuracy": 0.8369430303573608, + "num_tokens": 55265321.0, + "step": 1448 + }, + { + "epoch": 0.18432769367764915, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.7314732074737549, + "learning_rate": 6.138194150063585e-07, + "loss": 0.4638, + "mean_token_accuracy": 0.8516929149627686, + "num_tokens": 55304179.0, + "step": 1449 + }, + { + "epoch": 0.18445490395623967, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.8557595014572144, + "learning_rate": 6.142433234421365e-07, + "loss": 0.4974, + "mean_token_accuracy": 0.839302659034729, + "num_tokens": 55343266.0, + "step": 1450 + }, + { + "epoch": 0.18458211423483017, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.8205150365829468, + "learning_rate": 6.146672318779143e-07, + "loss": 0.4985, + "mean_token_accuracy": 0.8399143218994141, + "num_tokens": 55381147.0, + "step": 1451 + }, + { + "epoch": 0.18470932451342068, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.9798582792282104, + "learning_rate": 6.150911403136923e-07, + "loss": 0.4977, + "mean_token_accuracy": 0.8417952060699463, + "num_tokens": 55414855.0, + "step": 1452 + }, + { + "epoch": 0.1848365347920112, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.8079450130462646, + "learning_rate": 6.155150487494701e-07, + "loss": 0.4241, + "mean_token_accuracy": 0.8626754283905029, + "num_tokens": 55449699.0, + "step": 1453 + }, + { + "epoch": 0.1849637450706017, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 2.1020874977111816, + "learning_rate": 6.159389571852479e-07, + "loss": 0.4524, + "mean_token_accuracy": 0.8554605841636658, + "num_tokens": 55493202.0, + "step": 1454 + }, + { + "epoch": 0.1850909553491922, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.7812994718551636, + "learning_rate": 6.163628656210258e-07, + "loss": 0.4716, + "mean_token_accuracy": 0.8500247597694397, + "num_tokens": 55539873.0, + "step": 1455 + }, + { + "epoch": 0.18521816562778273, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.9071301221847534, + "learning_rate": 6.167867740568037e-07, + "loss": 0.5322, + "mean_token_accuracy": 0.8320525884628296, + "num_tokens": 55576338.0, + "step": 1456 + }, + { + "epoch": 0.18534537590637323, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.7993794679641724, + "learning_rate": 6.172106824925815e-07, + "loss": 0.4547, + "mean_token_accuracy": 0.8519983291625977, + "num_tokens": 55618330.0, + "step": 1457 + }, + { + "epoch": 0.18547258618496373, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.713289499282837, + "learning_rate": 6.176345909283595e-07, + "loss": 0.4708, + "mean_token_accuracy": 0.8473173975944519, + "num_tokens": 55657394.0, + "step": 1458 + }, + { + "epoch": 0.18559979646355426, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.7986695766448975, + "learning_rate": 6.180584993641373e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.8536700010299683, + "num_tokens": 55695087.0, + "step": 1459 + }, + { + "epoch": 0.18572700674214476, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.7753280401229858, + "learning_rate": 6.184824077999153e-07, + "loss": 0.4405, + "mean_token_accuracy": 0.8565807342529297, + "num_tokens": 55730852.0, + "step": 1460 + }, + { + "epoch": 0.18585421702073526, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.9010530710220337, + "learning_rate": 6.189063162356931e-07, + "loss": 0.5356, + "mean_token_accuracy": 0.8312567472457886, + "num_tokens": 55765678.0, + "step": 1461 + }, + { + "epoch": 0.1859814272993258, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.8294073343276978, + "learning_rate": 6.193302246714709e-07, + "loss": 0.4626, + "mean_token_accuracy": 0.8537781834602356, + "num_tokens": 55804409.0, + "step": 1462 + }, + { + "epoch": 0.1861086375779163, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.9687104225158691, + "learning_rate": 6.197541331072488e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.8413985967636108, + "num_tokens": 55839520.0, + "step": 1463 + }, + { + "epoch": 0.1862358478565068, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.8224916458129883, + "learning_rate": 6.201780415430267e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8480668067932129, + "num_tokens": 55879089.0, + "step": 1464 + }, + { + "epoch": 0.18636305813509732, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 2.070615291595459, + "learning_rate": 6.206019499788045e-07, + "loss": 0.5124, + "mean_token_accuracy": 0.836257815361023, + "num_tokens": 55916655.0, + "step": 1465 + }, + { + "epoch": 0.18649026841368782, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.74919855594635, + "learning_rate": 6.210258584145825e-07, + "loss": 0.4508, + "mean_token_accuracy": 0.8536334037780762, + "num_tokens": 55953828.0, + "step": 1466 + }, + { + "epoch": 0.18661747869227835, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.8539994955062866, + "learning_rate": 6.214497668503603e-07, + "loss": 0.4985, + "mean_token_accuracy": 0.8367562890052795, + "num_tokens": 55992037.0, + "step": 1467 + }, + { + "epoch": 0.18674468897086885, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 2.089397430419922, + "learning_rate": 6.218736752861383e-07, + "loss": 0.4596, + "mean_token_accuracy": 0.8539903163909912, + "num_tokens": 56020424.0, + "step": 1468 + }, + { + "epoch": 0.18687189924945935, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.8568438291549683, + "learning_rate": 6.22297583721916e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.8462997674942017, + "num_tokens": 56060839.0, + "step": 1469 + }, + { + "epoch": 0.18699910952804988, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.8849619626998901, + "learning_rate": 6.227214921576938e-07, + "loss": 0.521, + "mean_token_accuracy": 0.8384151458740234, + "num_tokens": 56099095.0, + "step": 1470 + }, + { + "epoch": 0.18712631980664038, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.982667088508606, + "learning_rate": 6.231454005934718e-07, + "loss": 0.4204, + "mean_token_accuracy": 0.8650259375572205, + "num_tokens": 56133805.0, + "step": 1471 + }, + { + "epoch": 0.18725353008523088, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 2.052377700805664, + "learning_rate": 6.235693090292496e-07, + "loss": 0.5376, + "mean_token_accuracy": 0.832054615020752, + "num_tokens": 56173064.0, + "step": 1472 + }, + { + "epoch": 0.1873807403638214, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.6226985454559326, + "learning_rate": 6.239932174650275e-07, + "loss": 0.4207, + "mean_token_accuracy": 0.8663656711578369, + "num_tokens": 56213285.0, + "step": 1473 + }, + { + "epoch": 0.1875079506424119, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.9595893621444702, + "learning_rate": 6.244171259008054e-07, + "loss": 0.5019, + "mean_token_accuracy": 0.8376631140708923, + "num_tokens": 56248218.0, + "step": 1474 + }, + { + "epoch": 0.1876351609210024, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.677027702331543, + "learning_rate": 6.248410343365833e-07, + "loss": 0.4456, + "mean_token_accuracy": 0.857501745223999, + "num_tokens": 56286098.0, + "step": 1475 + }, + { + "epoch": 0.18776237119959294, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.6470980644226074, + "learning_rate": 6.252649427723612e-07, + "loss": 0.4546, + "mean_token_accuracy": 0.8570352792739868, + "num_tokens": 56329732.0, + "step": 1476 + }, + { + "epoch": 0.18788958147818344, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.7064398527145386, + "learning_rate": 6.25688851208139e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8539384603500366, + "num_tokens": 56367838.0, + "step": 1477 + }, + { + "epoch": 0.18801679175677394, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.6998534202575684, + "learning_rate": 6.261127596439168e-07, + "loss": 0.5129, + "mean_token_accuracy": 0.8351505994796753, + "num_tokens": 56413059.0, + "step": 1478 + }, + { + "epoch": 0.18814400203536447, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.6355558633804321, + "learning_rate": 6.265366680796948e-07, + "loss": 0.4151, + "mean_token_accuracy": 0.8615801334381104, + "num_tokens": 56453806.0, + "step": 1479 + }, + { + "epoch": 0.18827121231395497, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.9041707515716553, + "learning_rate": 6.269605765154726e-07, + "loss": 0.4627, + "mean_token_accuracy": 0.8501454591751099, + "num_tokens": 56497448.0, + "step": 1480 + }, + { + "epoch": 0.18839842259254547, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.9106298685073853, + "learning_rate": 6.273844849512505e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.8497055172920227, + "num_tokens": 56537731.0, + "step": 1481 + }, + { + "epoch": 0.188525632871136, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.657104253768921, + "learning_rate": 6.278083933870284e-07, + "loss": 0.4049, + "mean_token_accuracy": 0.8699944615364075, + "num_tokens": 56580944.0, + "step": 1482 + }, + { + "epoch": 0.1886528431497265, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.9864907264709473, + "learning_rate": 6.282323018228063e-07, + "loss": 0.436, + "mean_token_accuracy": 0.8611783981323242, + "num_tokens": 56616306.0, + "step": 1483 + }, + { + "epoch": 0.188780053428317, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.7280224561691284, + "learning_rate": 6.286562102585841e-07, + "loss": 0.5135, + "mean_token_accuracy": 0.8394841551780701, + "num_tokens": 56661874.0, + "step": 1484 + }, + { + "epoch": 0.18890726370690752, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.8850722312927246, + "learning_rate": 6.29080118694362e-07, + "loss": 0.4119, + "mean_token_accuracy": 0.8660619854927063, + "num_tokens": 56696697.0, + "step": 1485 + }, + { + "epoch": 0.18903447398549802, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.887642502784729, + "learning_rate": 6.295040271301398e-07, + "loss": 0.4113, + "mean_token_accuracy": 0.8646906614303589, + "num_tokens": 56732581.0, + "step": 1486 + }, + { + "epoch": 0.18916168426408853, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.7161445617675781, + "learning_rate": 6.299279355659178e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.8537688255310059, + "num_tokens": 56777905.0, + "step": 1487 + }, + { + "epoch": 0.18928889454267905, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.9175200462341309, + "learning_rate": 6.303518440016956e-07, + "loss": 0.4778, + "mean_token_accuracy": 0.8459310531616211, + "num_tokens": 56812915.0, + "step": 1488 + }, + { + "epoch": 0.18941610482126955, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 2.00046706199646, + "learning_rate": 6.307757524374735e-07, + "loss": 0.4724, + "mean_token_accuracy": 0.8510928153991699, + "num_tokens": 56846907.0, + "step": 1489 + }, + { + "epoch": 0.18954331509986008, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.9331223964691162, + "learning_rate": 6.311996608732514e-07, + "loss": 0.5026, + "mean_token_accuracy": 0.8438471555709839, + "num_tokens": 56884614.0, + "step": 1490 + }, + { + "epoch": 0.18967052537845058, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.845422387123108, + "learning_rate": 6.316235693090292e-07, + "loss": 0.492, + "mean_token_accuracy": 0.8403511047363281, + "num_tokens": 56920508.0, + "step": 1491 + }, + { + "epoch": 0.18979773565704108, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.965528130531311, + "learning_rate": 6.320474777448071e-07, + "loss": 0.497, + "mean_token_accuracy": 0.8408972024917603, + "num_tokens": 56957629.0, + "step": 1492 + }, + { + "epoch": 0.1899249459356316, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.9060804843902588, + "learning_rate": 6.324713861805849e-07, + "loss": 0.4543, + "mean_token_accuracy": 0.8509709239006042, + "num_tokens": 56997265.0, + "step": 1493 + }, + { + "epoch": 0.1900521562142221, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 3.646972179412842, + "learning_rate": 6.328952946163628e-07, + "loss": 0.5537, + "mean_token_accuracy": 0.826714813709259, + "num_tokens": 57031519.0, + "step": 1494 + }, + { + "epoch": 0.1901793664928126, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.7423266172409058, + "learning_rate": 6.333192030521407e-07, + "loss": 0.4281, + "mean_token_accuracy": 0.8587162494659424, + "num_tokens": 57067264.0, + "step": 1495 + }, + { + "epoch": 0.19030657677140314, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.8852397203445435, + "learning_rate": 6.337431114879186e-07, + "loss": 0.48, + "mean_token_accuracy": 0.8469456434249878, + "num_tokens": 57104906.0, + "step": 1496 + }, + { + "epoch": 0.19043378704999364, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.811389684677124, + "learning_rate": 6.341670199236965e-07, + "loss": 0.4501, + "mean_token_accuracy": 0.8563173413276672, + "num_tokens": 57142534.0, + "step": 1497 + }, + { + "epoch": 0.19056099732858414, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.5539416074752808, + "learning_rate": 6.345909283594744e-07, + "loss": 0.4295, + "mean_token_accuracy": 0.8609368801116943, + "num_tokens": 57188735.0, + "step": 1498 + }, + { + "epoch": 0.19068820760717467, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.9187077283859253, + "learning_rate": 6.350148367952522e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.8394060134887695, + "num_tokens": 57224610.0, + "step": 1499 + }, + { + "epoch": 0.19081541788576517, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.889927864074707, + "learning_rate": 6.354387452310301e-07, + "loss": 0.4381, + "mean_token_accuracy": 0.8554925918579102, + "num_tokens": 57259553.0, + "step": 1500 + }, + { + "epoch": 0.19094262816435567, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.9717644453048706, + "learning_rate": 6.358626536668079e-07, + "loss": 0.4409, + "mean_token_accuracy": 0.8576075434684753, + "num_tokens": 57295819.0, + "step": 1501 + }, + { + "epoch": 0.1910698384429462, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.6093679666519165, + "learning_rate": 6.362865621025858e-07, + "loss": 0.4297, + "mean_token_accuracy": 0.8621025085449219, + "num_tokens": 57339110.0, + "step": 1502 + }, + { + "epoch": 0.1911970487215367, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.8532932996749878, + "learning_rate": 6.367104705383637e-07, + "loss": 0.4583, + "mean_token_accuracy": 0.852704644203186, + "num_tokens": 57377311.0, + "step": 1503 + }, + { + "epoch": 0.1913242590001272, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.7817391157150269, + "learning_rate": 6.371343789741416e-07, + "loss": 0.4685, + "mean_token_accuracy": 0.8452835083007812, + "num_tokens": 57419281.0, + "step": 1504 + }, + { + "epoch": 0.19145146927871773, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 1.9092254638671875, + "learning_rate": 6.375582874099195e-07, + "loss": 0.4589, + "mean_token_accuracy": 0.8512453436851501, + "num_tokens": 57458134.0, + "step": 1505 + }, + { + "epoch": 0.19157867955730823, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 1.9006339311599731, + "learning_rate": 6.379821958456974e-07, + "loss": 0.5109, + "mean_token_accuracy": 0.8356114625930786, + "num_tokens": 57492931.0, + "step": 1506 + }, + { + "epoch": 0.19170588983589873, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 1.8036749362945557, + "learning_rate": 6.384061042814751e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8449337482452393, + "num_tokens": 57529581.0, + "step": 1507 + }, + { + "epoch": 0.19183310011448926, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 1.7354774475097656, + "learning_rate": 6.38830012717253e-07, + "loss": 0.473, + "mean_token_accuracy": 0.8485745191574097, + "num_tokens": 57570302.0, + "step": 1508 + }, + { + "epoch": 0.19196031039307976, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 1.9400560855865479, + "learning_rate": 6.392539211530309e-07, + "loss": 0.4809, + "mean_token_accuracy": 0.8439730405807495, + "num_tokens": 57615974.0, + "step": 1509 + }, + { + "epoch": 0.19208752067167026, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 1.8349168300628662, + "learning_rate": 6.396778295888087e-07, + "loss": 0.4466, + "mean_token_accuracy": 0.8595380783081055, + "num_tokens": 57653737.0, + "step": 1510 + }, + { + "epoch": 0.1922147309502608, + "ewc_loss": 2.9653310775756836e-06, + "grad_norm": 1.9043350219726562, + "learning_rate": 6.401017380245867e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8452844619750977, + "num_tokens": 57693431.0, + "step": 1511 + }, + { + "epoch": 0.1923419412288513, + "ewc_loss": 2.9653310775756836e-06, + "grad_norm": 1.6901817321777344, + "learning_rate": 6.405256464603645e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8490825891494751, + "num_tokens": 57736413.0, + "step": 1512 + }, + { + "epoch": 0.1924691515074418, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 1.7710310220718384, + "learning_rate": 6.409495548961425e-07, + "loss": 0.4485, + "mean_token_accuracy": 0.8576619625091553, + "num_tokens": 57773293.0, + "step": 1513 + }, + { + "epoch": 0.19259636178603232, + "ewc_loss": 2.9653310775756836e-06, + "grad_norm": 1.8812586069107056, + "learning_rate": 6.413734633319203e-07, + "loss": 0.4617, + "mean_token_accuracy": 0.8520947694778442, + "num_tokens": 57809294.0, + "step": 1514 + }, + { + "epoch": 0.19272357206462282, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 1.8593816757202148, + "learning_rate": 6.417973717676981e-07, + "loss": 0.434, + "mean_token_accuracy": 0.8577212691307068, + "num_tokens": 57843610.0, + "step": 1515 + }, + { + "epoch": 0.19285078234321335, + "ewc_loss": 2.9653310775756836e-06, + "grad_norm": 1.665807843208313, + "learning_rate": 6.42221280203476e-07, + "loss": 0.4951, + "mean_token_accuracy": 0.8382047414779663, + "num_tokens": 57893098.0, + "step": 1516 + }, + { + "epoch": 0.19297799262180385, + "ewc_loss": 2.9653310775756836e-06, + "grad_norm": 1.7916035652160645, + "learning_rate": 6.426451886392539e-07, + "loss": 0.5202, + "mean_token_accuracy": 0.8382852077484131, + "num_tokens": 57933098.0, + "step": 1517 + }, + { + "epoch": 0.19310520290039435, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 1.813440203666687, + "learning_rate": 6.430690970750317e-07, + "loss": 0.4511, + "mean_token_accuracy": 0.8537203669548035, + "num_tokens": 57971502.0, + "step": 1518 + }, + { + "epoch": 0.19323241317898487, + "ewc_loss": 2.9653310775756836e-06, + "grad_norm": 2.0428879261016846, + "learning_rate": 6.434930055108097e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8407723903656006, + "num_tokens": 58001557.0, + "step": 1519 + }, + { + "epoch": 0.19335962345757537, + "ewc_loss": 2.9653310775756836e-06, + "grad_norm": 1.9815102815628052, + "learning_rate": 6.439169139465875e-07, + "loss": 0.5084, + "mean_token_accuracy": 0.8373013734817505, + "num_tokens": 58038058.0, + "step": 1520 + }, + { + "epoch": 0.19348683373616588, + "ewc_loss": 2.9653310775756836e-06, + "grad_norm": 1.8329553604125977, + "learning_rate": 6.443408223823655e-07, + "loss": 0.4776, + "mean_token_accuracy": 0.8445678949356079, + "num_tokens": 58073138.0, + "step": 1521 + }, + { + "epoch": 0.1936140440147564, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.732260823249817, + "learning_rate": 6.447647308181432e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.848922610282898, + "num_tokens": 58111793.0, + "step": 1522 + }, + { + "epoch": 0.1937412542933469, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.726630449295044, + "learning_rate": 6.451886392539211e-07, + "loss": 0.457, + "mean_token_accuracy": 0.8514116406440735, + "num_tokens": 58152911.0, + "step": 1523 + }, + { + "epoch": 0.1938684645719374, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.8036375045776367, + "learning_rate": 6.45612547689699e-07, + "loss": 0.4742, + "mean_token_accuracy": 0.8481590747833252, + "num_tokens": 58192095.0, + "step": 1524 + }, + { + "epoch": 0.19399567485052793, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.7259559631347656, + "learning_rate": 6.460364561254769e-07, + "loss": 0.43, + "mean_token_accuracy": 0.860451340675354, + "num_tokens": 58232651.0, + "step": 1525 + }, + { + "epoch": 0.19412288512911843, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.712518572807312, + "learning_rate": 6.464603645612547e-07, + "loss": 0.4284, + "mean_token_accuracy": 0.8616160154342651, + "num_tokens": 58270802.0, + "step": 1526 + }, + { + "epoch": 0.19425009540770893, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.8237241506576538, + "learning_rate": 6.468842729970327e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.8538206815719604, + "num_tokens": 58313179.0, + "step": 1527 + }, + { + "epoch": 0.19437730568629946, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.9194743633270264, + "learning_rate": 6.473081814328105e-07, + "loss": 0.5017, + "mean_token_accuracy": 0.8448877334594727, + "num_tokens": 58354728.0, + "step": 1528 + }, + { + "epoch": 0.19450451596488996, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.7689942121505737, + "learning_rate": 6.477320898685885e-07, + "loss": 0.4816, + "mean_token_accuracy": 0.8484020233154297, + "num_tokens": 58398649.0, + "step": 1529 + }, + { + "epoch": 0.19463172624348046, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.7969797849655151, + "learning_rate": 6.481559983043662e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.8408728837966919, + "num_tokens": 58438261.0, + "step": 1530 + }, + { + "epoch": 0.194758936522071, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.9633530378341675, + "learning_rate": 6.48579906740144e-07, + "loss": 0.4399, + "mean_token_accuracy": 0.8571980595588684, + "num_tokens": 58479141.0, + "step": 1531 + }, + { + "epoch": 0.1948861468006615, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.7750929594039917, + "learning_rate": 6.49003815175922e-07, + "loss": 0.4089, + "mean_token_accuracy": 0.8693996071815491, + "num_tokens": 58515700.0, + "step": 1532 + }, + { + "epoch": 0.195013357079252, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.9003506898880005, + "learning_rate": 6.494277236116998e-07, + "loss": 0.4147, + "mean_token_accuracy": 0.8659622669219971, + "num_tokens": 58549825.0, + "step": 1533 + }, + { + "epoch": 0.19514056735784252, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.673403263092041, + "learning_rate": 6.498516320474777e-07, + "loss": 0.5538, + "mean_token_accuracy": 0.8271806836128235, + "num_tokens": 58596110.0, + "step": 1534 + }, + { + "epoch": 0.19526777763643302, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.791151762008667, + "learning_rate": 6.502755404832556e-07, + "loss": 0.4645, + "mean_token_accuracy": 0.850132167339325, + "num_tokens": 58632410.0, + "step": 1535 + }, + { + "epoch": 0.19539498791502352, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.7913215160369873, + "learning_rate": 6.506994489190335e-07, + "loss": 0.5254, + "mean_token_accuracy": 0.836921751499176, + "num_tokens": 58671339.0, + "step": 1536 + }, + { + "epoch": 0.19552219819361405, + "ewc_loss": 3.0249357223510742e-06, + "grad_norm": 1.7253444194793701, + "learning_rate": 6.511233573548114e-07, + "loss": 0.4272, + "mean_token_accuracy": 0.862470269203186, + "num_tokens": 58711786.0, + "step": 1537 + }, + { + "epoch": 0.19564940847220455, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.9387381076812744, + "learning_rate": 6.515472657905892e-07, + "loss": 0.47, + "mean_token_accuracy": 0.8487739562988281, + "num_tokens": 58743600.0, + "step": 1538 + }, + { + "epoch": 0.19577661875079505, + "ewc_loss": 3.0100345611572266e-06, + "grad_norm": 1.943076252937317, + "learning_rate": 6.51971174226367e-07, + "loss": 0.5246, + "mean_token_accuracy": 0.8334605097770691, + "num_tokens": 58780622.0, + "step": 1539 + }, + { + "epoch": 0.19590382902938558, + "ewc_loss": 3.0100345611572266e-06, + "grad_norm": 1.8154728412628174, + "learning_rate": 6.52395082662145e-07, + "loss": 0.5167, + "mean_token_accuracy": 0.8350876569747925, + "num_tokens": 58819370.0, + "step": 1540 + }, + { + "epoch": 0.19603103930797608, + "ewc_loss": 3.0100345611572266e-06, + "grad_norm": 1.9569050073623657, + "learning_rate": 6.528189910979228e-07, + "loss": 0.4437, + "mean_token_accuracy": 0.8569519519805908, + "num_tokens": 58856270.0, + "step": 1541 + }, + { + "epoch": 0.1961582495865666, + "ewc_loss": 3.0100345611572266e-06, + "grad_norm": 1.6619727611541748, + "learning_rate": 6.532428995337007e-07, + "loss": 0.4607, + "mean_token_accuracy": 0.851990818977356, + "num_tokens": 58898702.0, + "step": 1542 + }, + { + "epoch": 0.1962854598651571, + "ewc_loss": 3.0100345611572266e-06, + "grad_norm": 1.8681546449661255, + "learning_rate": 6.536668079694786e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8434100151062012, + "num_tokens": 58934602.0, + "step": 1543 + }, + { + "epoch": 0.1964126701437476, + "ewc_loss": 3.0100345611572266e-06, + "grad_norm": 1.668053150177002, + "learning_rate": 6.540907164052565e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8483133912086487, + "num_tokens": 58977766.0, + "step": 1544 + }, + { + "epoch": 0.19653988042233814, + "ewc_loss": 3.0249357223510742e-06, + "grad_norm": 1.7067177295684814, + "learning_rate": 6.545146248410343e-07, + "loss": 0.4778, + "mean_token_accuracy": 0.8459742069244385, + "num_tokens": 59019649.0, + "step": 1545 + }, + { + "epoch": 0.19666709070092864, + "ewc_loss": 3.0249357223510742e-06, + "grad_norm": 1.8002787828445435, + "learning_rate": 6.549385332768122e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.8445346355438232, + "num_tokens": 59060543.0, + "step": 1546 + }, + { + "epoch": 0.19679430097951914, + "ewc_loss": 3.0249357223510742e-06, + "grad_norm": 1.8153480291366577, + "learning_rate": 6.5536244171259e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.8423080444335938, + "num_tokens": 59100049.0, + "step": 1547 + }, + { + "epoch": 0.19692151125810967, + "ewc_loss": 3.0249357223510742e-06, + "grad_norm": 1.8694499731063843, + "learning_rate": 6.55786350148368e-07, + "loss": 0.4938, + "mean_token_accuracy": 0.8418875336647034, + "num_tokens": 59137370.0, + "step": 1548 + }, + { + "epoch": 0.19704872153670017, + "ewc_loss": 3.0547380447387695e-06, + "grad_norm": 1.9396705627441406, + "learning_rate": 6.562102585841458e-07, + "loss": 0.4493, + "mean_token_accuracy": 0.8502947092056274, + "num_tokens": 59176196.0, + "step": 1549 + }, + { + "epoch": 0.19717593181529067, + "ewc_loss": 3.0547380447387695e-06, + "grad_norm": 1.7401319742202759, + "learning_rate": 6.566341670199236e-07, + "loss": 0.4465, + "mean_token_accuracy": 0.8561704754829407, + "num_tokens": 59216215.0, + "step": 1550 + }, + { + "epoch": 0.1973031420938812, + "ewc_loss": 3.0547380447387695e-06, + "grad_norm": 1.7921488285064697, + "learning_rate": 6.570580754557016e-07, + "loss": 0.4619, + "mean_token_accuracy": 0.8507834076881409, + "num_tokens": 59254173.0, + "step": 1551 + }, + { + "epoch": 0.1974303523724717, + "ewc_loss": 3.0547380447387695e-06, + "grad_norm": 1.8381954431533813, + "learning_rate": 6.574819838914794e-07, + "loss": 0.4355, + "mean_token_accuracy": 0.8601614236831665, + "num_tokens": 59288496.0, + "step": 1552 + }, + { + "epoch": 0.1975575626510622, + "ewc_loss": 3.0547380447387695e-06, + "grad_norm": 1.759421706199646, + "learning_rate": 6.579058923272573e-07, + "loss": 0.4924, + "mean_token_accuracy": 0.8411665558815002, + "num_tokens": 59329043.0, + "step": 1553 + }, + { + "epoch": 0.19768477292965272, + "ewc_loss": 3.0547380447387695e-06, + "grad_norm": 2.0112502574920654, + "learning_rate": 6.583298007630351e-07, + "loss": 0.483, + "mean_token_accuracy": 0.8459339141845703, + "num_tokens": 59360743.0, + "step": 1554 + }, + { + "epoch": 0.19781198320824323, + "ewc_loss": 3.0547380447387695e-06, + "grad_norm": 1.7812104225158691, + "learning_rate": 6.58753709198813e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.845299243927002, + "num_tokens": 59404167.0, + "step": 1555 + }, + { + "epoch": 0.19793919348683373, + "ewc_loss": 3.0547380447387695e-06, + "grad_norm": 1.6301536560058594, + "learning_rate": 6.591776176345909e-07, + "loss": 0.4491, + "mean_token_accuracy": 0.851862907409668, + "num_tokens": 59447770.0, + "step": 1556 + }, + { + "epoch": 0.19806640376542425, + "ewc_loss": 3.0547380447387695e-06, + "grad_norm": 1.6764169931411743, + "learning_rate": 6.596015260703688e-07, + "loss": 0.4568, + "mean_token_accuracy": 0.8562158346176147, + "num_tokens": 59494989.0, + "step": 1557 + }, + { + "epoch": 0.19819361404401475, + "ewc_loss": 3.0547380447387695e-06, + "grad_norm": 1.7810251712799072, + "learning_rate": 6.600254345061466e-07, + "loss": 0.4462, + "mean_token_accuracy": 0.8576209545135498, + "num_tokens": 59534926.0, + "step": 1558 + }, + { + "epoch": 0.19832082432260525, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 2.0358479022979736, + "learning_rate": 6.604493429419246e-07, + "loss": 0.501, + "mean_token_accuracy": 0.8400030732154846, + "num_tokens": 59577832.0, + "step": 1559 + }, + { + "epoch": 0.19844803460119578, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.7499265670776367, + "learning_rate": 6.608732513777023e-07, + "loss": 0.5072, + "mean_token_accuracy": 0.844947338104248, + "num_tokens": 59624995.0, + "step": 1560 + }, + { + "epoch": 0.19857524487978628, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.7943778038024902, + "learning_rate": 6.612971598134803e-07, + "loss": 0.4614, + "mean_token_accuracy": 0.8520607948303223, + "num_tokens": 59663000.0, + "step": 1561 + }, + { + "epoch": 0.19870245515837678, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.9354451894760132, + "learning_rate": 6.617210682492581e-07, + "loss": 0.5125, + "mean_token_accuracy": 0.8367595672607422, + "num_tokens": 59700241.0, + "step": 1562 + }, + { + "epoch": 0.1988296654369673, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.9383496046066284, + "learning_rate": 6.62144976685036e-07, + "loss": 0.436, + "mean_token_accuracy": 0.8613333702087402, + "num_tokens": 59737244.0, + "step": 1563 + }, + { + "epoch": 0.1989568757155578, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.9644484519958496, + "learning_rate": 6.625688851208139e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.8454301357269287, + "num_tokens": 59775538.0, + "step": 1564 + }, + { + "epoch": 0.19908408599414834, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.9146957397460938, + "learning_rate": 6.629927935565918e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.8436803817749023, + "num_tokens": 59810373.0, + "step": 1565 + }, + { + "epoch": 0.19921129627273884, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 2.573381185531616, + "learning_rate": 6.634167019923696e-07, + "loss": 0.5107, + "mean_token_accuracy": 0.8416246771812439, + "num_tokens": 59849992.0, + "step": 1566 + }, + { + "epoch": 0.19933850655132934, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.9097373485565186, + "learning_rate": 6.638406104281476e-07, + "loss": 0.4699, + "mean_token_accuracy": 0.8489469289779663, + "num_tokens": 59882552.0, + "step": 1567 + }, + { + "epoch": 0.19946571682991987, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.7452540397644043, + "learning_rate": 6.642645188639253e-07, + "loss": 0.446, + "mean_token_accuracy": 0.8544020652770996, + "num_tokens": 59925090.0, + "step": 1568 + }, + { + "epoch": 0.19959292710851037, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.8718937635421753, + "learning_rate": 6.646884272997032e-07, + "loss": 0.4939, + "mean_token_accuracy": 0.8424782156944275, + "num_tokens": 59965406.0, + "step": 1569 + }, + { + "epoch": 0.19972013738710087, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.8052318096160889, + "learning_rate": 6.651123357354811e-07, + "loss": 0.4802, + "mean_token_accuracy": 0.8465441465377808, + "num_tokens": 60001770.0, + "step": 1570 + }, + { + "epoch": 0.1998473476656914, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.8364570140838623, + "learning_rate": 6.655362441712589e-07, + "loss": 0.4978, + "mean_token_accuracy": 0.8374145030975342, + "num_tokens": 60036911.0, + "step": 1571 + }, + { + "epoch": 0.1999745579442819, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.8988865613937378, + "learning_rate": 6.659601526070369e-07, + "loss": 0.5247, + "mean_token_accuracy": 0.836906373500824, + "num_tokens": 60080210.0, + "step": 1572 + }, + { + "epoch": 0.2001017682228724, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 2.1462807655334473, + "learning_rate": 6.663840610428147e-07, + "loss": 0.5581, + "mean_token_accuracy": 0.828713059425354, + "num_tokens": 60111533.0, + "step": 1573 + }, + { + "epoch": 0.20022897850146293, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.9519898891448975, + "learning_rate": 6.668079694785926e-07, + "loss": 0.4382, + "mean_token_accuracy": 0.8586897850036621, + "num_tokens": 60150733.0, + "step": 1574 + }, + { + "epoch": 0.20035618878005343, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.8457989692687988, + "learning_rate": 6.672318779143704e-07, + "loss": 0.4561, + "mean_token_accuracy": 0.8550764322280884, + "num_tokens": 60185034.0, + "step": 1575 + }, + { + "epoch": 0.20048339905864393, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 2.080641984939575, + "learning_rate": 6.676557863501483e-07, + "loss": 0.4863, + "mean_token_accuracy": 0.8447853326797485, + "num_tokens": 60216865.0, + "step": 1576 + }, + { + "epoch": 0.20061060933723446, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.7470276355743408, + "learning_rate": 6.680796947859262e-07, + "loss": 0.4749, + "mean_token_accuracy": 0.8471399545669556, + "num_tokens": 60257685.0, + "step": 1577 + }, + { + "epoch": 0.20073781961582496, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 2.3122804164886475, + "learning_rate": 6.685036032217041e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8494695425033569, + "num_tokens": 60294258.0, + "step": 1578 + }, + { + "epoch": 0.20086502989441546, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.7432254552841187, + "learning_rate": 6.689275116574819e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8557730913162231, + "num_tokens": 60330319.0, + "step": 1579 + }, + { + "epoch": 0.200992240173006, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.7218809127807617, + "learning_rate": 6.693514200932599e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.8495659828186035, + "num_tokens": 60370482.0, + "step": 1580 + }, + { + "epoch": 0.2011194504515965, + "ewc_loss": 3.0994415283203125e-06, + "grad_norm": 1.9750280380249023, + "learning_rate": 6.697753285290377e-07, + "loss": 0.4822, + "mean_token_accuracy": 0.8445959687232971, + "num_tokens": 60412436.0, + "step": 1581 + }, + { + "epoch": 0.201246660730187, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.9287441968917847, + "learning_rate": 6.701992369648156e-07, + "loss": 0.4068, + "mean_token_accuracy": 0.8696110248565674, + "num_tokens": 60448982.0, + "step": 1582 + }, + { + "epoch": 0.20137387100877752, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.7713526487350464, + "learning_rate": 6.706231454005934e-07, + "loss": 0.4232, + "mean_token_accuracy": 0.861647367477417, + "num_tokens": 60489878.0, + "step": 1583 + }, + { + "epoch": 0.20150108128736802, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.7941218614578247, + "learning_rate": 6.710470538363713e-07, + "loss": 0.4593, + "mean_token_accuracy": 0.8549424409866333, + "num_tokens": 60528979.0, + "step": 1584 + }, + { + "epoch": 0.20162829156595852, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.9022455215454102, + "learning_rate": 6.714709622721492e-07, + "loss": 0.489, + "mean_token_accuracy": 0.8428617119789124, + "num_tokens": 60561670.0, + "step": 1585 + }, + { + "epoch": 0.20175550184454905, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.7859922647476196, + "learning_rate": 6.718948707079271e-07, + "loss": 0.4616, + "mean_token_accuracy": 0.8516948819160461, + "num_tokens": 60602061.0, + "step": 1586 + }, + { + "epoch": 0.20188271212313955, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.6943683624267578, + "learning_rate": 6.723187791437049e-07, + "loss": 0.4627, + "mean_token_accuracy": 0.851991593837738, + "num_tokens": 60639805.0, + "step": 1587 + }, + { + "epoch": 0.20200992240173005, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.6960370540618896, + "learning_rate": 6.727426875794829e-07, + "loss": 0.4911, + "mean_token_accuracy": 0.8434246778488159, + "num_tokens": 60681816.0, + "step": 1588 + }, + { + "epoch": 0.20213713268032057, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.8205558061599731, + "learning_rate": 6.731665960152607e-07, + "loss": 0.4754, + "mean_token_accuracy": 0.8449302911758423, + "num_tokens": 60726343.0, + "step": 1589 + }, + { + "epoch": 0.20226434295891108, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.8724631071090698, + "learning_rate": 6.735905044510385e-07, + "loss": 0.5204, + "mean_token_accuracy": 0.8281002044677734, + "num_tokens": 60764814.0, + "step": 1590 + }, + { + "epoch": 0.2023915532375016, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.8342652320861816, + "learning_rate": 6.740144128868164e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.8447058796882629, + "num_tokens": 60802384.0, + "step": 1591 + }, + { + "epoch": 0.2025187635160921, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 2.253840923309326, + "learning_rate": 6.744383213225942e-07, + "loss": 0.5122, + "mean_token_accuracy": 0.8360943794250488, + "num_tokens": 60842085.0, + "step": 1592 + }, + { + "epoch": 0.2026459737946826, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.859409213066101, + "learning_rate": 6.748622297583722e-07, + "loss": 0.4493, + "mean_token_accuracy": 0.8522511720657349, + "num_tokens": 60874835.0, + "step": 1593 + }, + { + "epoch": 0.20277318407327313, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.8242491483688354, + "learning_rate": 6.7528613819415e-07, + "loss": 0.4732, + "mean_token_accuracy": 0.8454856872558594, + "num_tokens": 60910682.0, + "step": 1594 + }, + { + "epoch": 0.20290039435186363, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.734654426574707, + "learning_rate": 6.757100466299279e-07, + "loss": 0.471, + "mean_token_accuracy": 0.8501155376434326, + "num_tokens": 60951352.0, + "step": 1595 + }, + { + "epoch": 0.20302760463045413, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 2.730703353881836, + "learning_rate": 6.761339550657058e-07, + "loss": 0.496, + "mean_token_accuracy": 0.840772271156311, + "num_tokens": 60979055.0, + "step": 1596 + }, + { + "epoch": 0.20315481490904466, + "ewc_loss": 3.1441450119018555e-06, + "grad_norm": 1.844314694404602, + "learning_rate": 6.765578635014837e-07, + "loss": 0.479, + "mean_token_accuracy": 0.8490285873413086, + "num_tokens": 61017437.0, + "step": 1597 + }, + { + "epoch": 0.20328202518763516, + "ewc_loss": 3.1441450119018555e-06, + "grad_norm": 1.7947782278060913, + "learning_rate": 6.769817719372614e-07, + "loss": 0.4538, + "mean_token_accuracy": 0.8540197610855103, + "num_tokens": 61052286.0, + "step": 1598 + }, + { + "epoch": 0.20340923546622566, + "ewc_loss": 3.1441450119018555e-06, + "grad_norm": 1.92121160030365, + "learning_rate": 6.774056803730394e-07, + "loss": 0.5, + "mean_token_accuracy": 0.8403088450431824, + "num_tokens": 61086110.0, + "step": 1599 + }, + { + "epoch": 0.2035364457448162, + "ewc_loss": 3.1441450119018555e-06, + "grad_norm": 1.6592788696289062, + "learning_rate": 6.778295888088172e-07, + "loss": 0.4705, + "mean_token_accuracy": 0.8492551445960999, + "num_tokens": 61124181.0, + "step": 1600 + }, + { + "epoch": 0.2036636560234067, + "ewc_loss": 3.159046173095703e-06, + "grad_norm": 1.8568825721740723, + "learning_rate": 6.782534972445952e-07, + "loss": 0.4896, + "mean_token_accuracy": 0.8421577215194702, + "num_tokens": 61160881.0, + "step": 1601 + }, + { + "epoch": 0.2037908663019972, + "ewc_loss": 3.159046173095703e-06, + "grad_norm": 1.8060636520385742, + "learning_rate": 6.78677405680373e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.845796525478363, + "num_tokens": 61200755.0, + "step": 1602 + }, + { + "epoch": 0.20391807658058772, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 2.31023907661438, + "learning_rate": 6.791013141161509e-07, + "loss": 0.4796, + "mean_token_accuracy": 0.843056321144104, + "num_tokens": 61240045.0, + "step": 1603 + }, + { + "epoch": 0.20404528685917822, + "ewc_loss": 3.159046173095703e-06, + "grad_norm": 1.7576969861984253, + "learning_rate": 6.795252225519288e-07, + "loss": 0.5046, + "mean_token_accuracy": 0.837045431137085, + "num_tokens": 61282832.0, + "step": 1604 + }, + { + "epoch": 0.20417249713776872, + "ewc_loss": 3.159046173095703e-06, + "grad_norm": 1.7713502645492554, + "learning_rate": 6.799491309877067e-07, + "loss": 0.4787, + "mean_token_accuracy": 0.8438114523887634, + "num_tokens": 61319841.0, + "step": 1605 + }, + { + "epoch": 0.20429970741635925, + "ewc_loss": 3.159046173095703e-06, + "grad_norm": 1.9523903131484985, + "learning_rate": 6.803730394234844e-07, + "loss": 0.4465, + "mean_token_accuracy": 0.8576165437698364, + "num_tokens": 61352732.0, + "step": 1606 + }, + { + "epoch": 0.20442691769494975, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.9184484481811523, + "learning_rate": 6.807969478592624e-07, + "loss": 0.4802, + "mean_token_accuracy": 0.848355770111084, + "num_tokens": 61388324.0, + "step": 1607 + }, + { + "epoch": 0.20455412797354025, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.744529366493225, + "learning_rate": 6.812208562950402e-07, + "loss": 0.5042, + "mean_token_accuracy": 0.8448489904403687, + "num_tokens": 61431222.0, + "step": 1608 + }, + { + "epoch": 0.20468133825213078, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.8781664371490479, + "learning_rate": 6.816447647308182e-07, + "loss": 0.5325, + "mean_token_accuracy": 0.8302909135818481, + "num_tokens": 61475617.0, + "step": 1609 + }, + { + "epoch": 0.20480854853072128, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.8117992877960205, + "learning_rate": 6.82068673166596e-07, + "loss": 0.4617, + "mean_token_accuracy": 0.8531621694564819, + "num_tokens": 61511338.0, + "step": 1610 + }, + { + "epoch": 0.20493575880931178, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.8316640853881836, + "learning_rate": 6.824925816023738e-07, + "loss": 0.4831, + "mean_token_accuracy": 0.8441598415374756, + "num_tokens": 61549093.0, + "step": 1611 + }, + { + "epoch": 0.2050629690879023, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.6643104553222656, + "learning_rate": 6.829164900381518e-07, + "loss": 0.4519, + "mean_token_accuracy": 0.8544909358024597, + "num_tokens": 61593843.0, + "step": 1612 + }, + { + "epoch": 0.2051901793664928, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.8737584352493286, + "learning_rate": 6.833403984739295e-07, + "loss": 0.4629, + "mean_token_accuracy": 0.8503782749176025, + "num_tokens": 61629362.0, + "step": 1613 + }, + { + "epoch": 0.2053173896450833, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.828622817993164, + "learning_rate": 6.837643069097074e-07, + "loss": 0.5434, + "mean_token_accuracy": 0.8278300166130066, + "num_tokens": 61667037.0, + "step": 1614 + }, + { + "epoch": 0.20544459992367384, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.804967999458313, + "learning_rate": 6.841882153454853e-07, + "loss": 0.4596, + "mean_token_accuracy": 0.8493970632553101, + "num_tokens": 61701976.0, + "step": 1615 + }, + { + "epoch": 0.20557181020226434, + "ewc_loss": 3.1888484954833984e-06, + "grad_norm": 1.8138444423675537, + "learning_rate": 6.846121237812632e-07, + "loss": 0.4275, + "mean_token_accuracy": 0.861868143081665, + "num_tokens": 61739068.0, + "step": 1616 + }, + { + "epoch": 0.20569902048085487, + "ewc_loss": 3.1888484954833984e-06, + "grad_norm": 1.737855076789856, + "learning_rate": 6.850360322170411e-07, + "loss": 0.4487, + "mean_token_accuracy": 0.8553860187530518, + "num_tokens": 61781515.0, + "step": 1617 + }, + { + "epoch": 0.20582623075944537, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 4.067259311676025, + "learning_rate": 6.85459940652819e-07, + "loss": 0.4607, + "mean_token_accuracy": 0.8495378494262695, + "num_tokens": 61815201.0, + "step": 1618 + }, + { + "epoch": 0.20595344103803587, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.9776848554611206, + "learning_rate": 6.858838490885968e-07, + "loss": 0.5163, + "mean_token_accuracy": 0.8347412943840027, + "num_tokens": 61856639.0, + "step": 1619 + }, + { + "epoch": 0.2060806513166264, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.970041275024414, + "learning_rate": 6.863077575243748e-07, + "loss": 0.5271, + "mean_token_accuracy": 0.8347486853599548, + "num_tokens": 61894313.0, + "step": 1620 + }, + { + "epoch": 0.2062078615952169, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.916172981262207, + "learning_rate": 6.867316659601525e-07, + "loss": 0.4645, + "mean_token_accuracy": 0.8497198224067688, + "num_tokens": 61930748.0, + "step": 1621 + }, + { + "epoch": 0.2063350718738074, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.6773788928985596, + "learning_rate": 6.871555743959304e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.8482921719551086, + "num_tokens": 61972002.0, + "step": 1622 + }, + { + "epoch": 0.20646228215239792, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 2.356947422027588, + "learning_rate": 6.875794828317083e-07, + "loss": 0.4546, + "mean_token_accuracy": 0.8556475639343262, + "num_tokens": 62010141.0, + "step": 1623 + }, + { + "epoch": 0.20658949243098843, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.622045874595642, + "learning_rate": 6.880033912674862e-07, + "loss": 0.4438, + "mean_token_accuracy": 0.8586560487747192, + "num_tokens": 62055907.0, + "step": 1624 + }, + { + "epoch": 0.20671670270957893, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.9083995819091797, + "learning_rate": 6.884272997032641e-07, + "loss": 0.5487, + "mean_token_accuracy": 0.8251575231552124, + "num_tokens": 62095885.0, + "step": 1625 + }, + { + "epoch": 0.20684391298816945, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.7709543704986572, + "learning_rate": 6.88851208139042e-07, + "loss": 0.4418, + "mean_token_accuracy": 0.8607171177864075, + "num_tokens": 62130497.0, + "step": 1626 + }, + { + "epoch": 0.20697112326675995, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 2.4245636463165283, + "learning_rate": 6.892751165748198e-07, + "loss": 0.4981, + "mean_token_accuracy": 0.838898777961731, + "num_tokens": 62170296.0, + "step": 1627 + }, + { + "epoch": 0.20709833354535045, + "ewc_loss": 3.2335519790649414e-06, + "grad_norm": 1.782555103302002, + "learning_rate": 6.896990250105978e-07, + "loss": 0.441, + "mean_token_accuracy": 0.860555112361908, + "num_tokens": 62210168.0, + "step": 1628 + }, + { + "epoch": 0.20722554382394098, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.8993138074874878, + "learning_rate": 6.901229334463755e-07, + "loss": 0.463, + "mean_token_accuracy": 0.849716305732727, + "num_tokens": 62245251.0, + "step": 1629 + }, + { + "epoch": 0.20735275410253148, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 2.0393178462982178, + "learning_rate": 6.905468418821534e-07, + "loss": 0.464, + "mean_token_accuracy": 0.8562526702880859, + "num_tokens": 62286124.0, + "step": 1630 + }, + { + "epoch": 0.20747996438112198, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.8289192914962769, + "learning_rate": 6.909707503179313e-07, + "loss": 0.4669, + "mean_token_accuracy": 0.8466898202896118, + "num_tokens": 62325604.0, + "step": 1631 + }, + { + "epoch": 0.2076071746597125, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.9156352281570435, + "learning_rate": 6.913946587537091e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8509905338287354, + "num_tokens": 62364467.0, + "step": 1632 + }, + { + "epoch": 0.207734384938303, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 2.005755662918091, + "learning_rate": 6.918185671894871e-07, + "loss": 0.5378, + "mean_token_accuracy": 0.8288277387619019, + "num_tokens": 62402075.0, + "step": 1633 + }, + { + "epoch": 0.2078615952168935, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.835984230041504, + "learning_rate": 6.922424756252649e-07, + "loss": 0.5061, + "mean_token_accuracy": 0.8466806411743164, + "num_tokens": 62437802.0, + "step": 1634 + }, + { + "epoch": 0.20798880549548404, + "ewc_loss": 3.203749656677246e-06, + "grad_norm": 1.7628395557403564, + "learning_rate": 6.926663840610428e-07, + "loss": 0.4239, + "mean_token_accuracy": 0.8624932765960693, + "num_tokens": 62476176.0, + "step": 1635 + }, + { + "epoch": 0.20811601577407454, + "ewc_loss": 3.203749656677246e-06, + "grad_norm": 1.7354296445846558, + "learning_rate": 6.930902924968206e-07, + "loss": 0.4673, + "mean_token_accuracy": 0.8468654751777649, + "num_tokens": 62514378.0, + "step": 1636 + }, + { + "epoch": 0.20824322605266504, + "ewc_loss": 3.203749656677246e-06, + "grad_norm": 1.7675045728683472, + "learning_rate": 6.935142009325985e-07, + "loss": 0.4889, + "mean_token_accuracy": 0.8428542613983154, + "num_tokens": 62555302.0, + "step": 1637 + }, + { + "epoch": 0.20837043633125557, + "ewc_loss": 3.203749656677246e-06, + "grad_norm": 1.809734582901001, + "learning_rate": 6.939381093683764e-07, + "loss": 0.507, + "mean_token_accuracy": 0.8383960127830505, + "num_tokens": 62594356.0, + "step": 1638 + }, + { + "epoch": 0.20849764660984607, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.7970465421676636, + "learning_rate": 6.943620178041543e-07, + "loss": 0.4264, + "mean_token_accuracy": 0.8641021847724915, + "num_tokens": 62632903.0, + "step": 1639 + }, + { + "epoch": 0.2086248568884366, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.8365767002105713, + "learning_rate": 6.947859262399321e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8544932007789612, + "num_tokens": 62668979.0, + "step": 1640 + }, + { + "epoch": 0.2087520671670271, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.809741735458374, + "learning_rate": 6.952098346757101e-07, + "loss": 0.4626, + "mean_token_accuracy": 0.8513879179954529, + "num_tokens": 62705867.0, + "step": 1641 + }, + { + "epoch": 0.2088792774456176, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 1.7932040691375732, + "learning_rate": 6.956337431114879e-07, + "loss": 0.456, + "mean_token_accuracy": 0.8544238805770874, + "num_tokens": 62744559.0, + "step": 1642 + }, + { + "epoch": 0.20900648772420813, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 1.7351206541061401, + "learning_rate": 6.960576515472658e-07, + "loss": 0.4886, + "mean_token_accuracy": 0.844823956489563, + "num_tokens": 62785221.0, + "step": 1643 + }, + { + "epoch": 0.20913369800279863, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 2.3527984619140625, + "learning_rate": 6.964815599830436e-07, + "loss": 0.4776, + "mean_token_accuracy": 0.8473075032234192, + "num_tokens": 62820095.0, + "step": 1644 + }, + { + "epoch": 0.20926090828138913, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 1.7516969442367554, + "learning_rate": 6.969054684188215e-07, + "loss": 0.5271, + "mean_token_accuracy": 0.8290310502052307, + "num_tokens": 62863414.0, + "step": 1645 + }, + { + "epoch": 0.20938811855997966, + "ewc_loss": 3.2633543014526367e-06, + "grad_norm": 1.642904281616211, + "learning_rate": 6.973293768545994e-07, + "loss": 0.4386, + "mean_token_accuracy": 0.8592179417610168, + "num_tokens": 62904647.0, + "step": 1646 + }, + { + "epoch": 0.20951532883857016, + "ewc_loss": 3.2633543014526367e-06, + "grad_norm": 1.7920302152633667, + "learning_rate": 6.977532852903773e-07, + "loss": 0.4721, + "mean_token_accuracy": 0.8481357097625732, + "num_tokens": 62949476.0, + "step": 1647 + }, + { + "epoch": 0.20964253911716066, + "ewc_loss": 3.2633543014526367e-06, + "grad_norm": 1.8966517448425293, + "learning_rate": 6.981771937261551e-07, + "loss": 0.5222, + "mean_token_accuracy": 0.835007905960083, + "num_tokens": 62984784.0, + "step": 1648 + }, + { + "epoch": 0.2097697493957512, + "ewc_loss": 3.2633543014526367e-06, + "grad_norm": 1.7805826663970947, + "learning_rate": 6.986011021619331e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8466126322746277, + "num_tokens": 63024395.0, + "step": 1649 + }, + { + "epoch": 0.2098969596743417, + "ewc_loss": 3.2633543014526367e-06, + "grad_norm": 2.153369903564453, + "learning_rate": 6.990250105977109e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.8441400527954102, + "num_tokens": 63061886.0, + "step": 1650 + }, + { + "epoch": 0.2100241699529322, + "ewc_loss": 3.2633543014526367e-06, + "grad_norm": 1.8390682935714722, + "learning_rate": 6.994489190334886e-07, + "loss": 0.4621, + "mean_token_accuracy": 0.8506202697753906, + "num_tokens": 63104150.0, + "step": 1651 + }, + { + "epoch": 0.21015138023152272, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.8231340646743774, + "learning_rate": 6.998728274692666e-07, + "loss": 0.4278, + "mean_token_accuracy": 0.8625305891036987, + "num_tokens": 63144346.0, + "step": 1652 + }, + { + "epoch": 0.21027859051011322, + "ewc_loss": 3.2633543014526367e-06, + "grad_norm": 1.7830586433410645, + "learning_rate": 7.002967359050444e-07, + "loss": 0.4675, + "mean_token_accuracy": 0.8556773066520691, + "num_tokens": 63180042.0, + "step": 1653 + }, + { + "epoch": 0.21040580078870372, + "ewc_loss": 3.2633543014526367e-06, + "grad_norm": 1.6804008483886719, + "learning_rate": 7.007206443408224e-07, + "loss": 0.4181, + "mean_token_accuracy": 0.8640393614768982, + "num_tokens": 63216570.0, + "step": 1654 + }, + { + "epoch": 0.21053301106729425, + "ewc_loss": 3.2633543014526367e-06, + "grad_norm": 1.7644559144973755, + "learning_rate": 7.011445527766002e-07, + "loss": 0.4696, + "mean_token_accuracy": 0.8497470617294312, + "num_tokens": 63253457.0, + "step": 1655 + }, + { + "epoch": 0.21066022134588475, + "ewc_loss": 3.2633543014526367e-06, + "grad_norm": 1.8316054344177246, + "learning_rate": 7.015684612123781e-07, + "loss": 0.4564, + "mean_token_accuracy": 0.8517932891845703, + "num_tokens": 63295430.0, + "step": 1656 + }, + { + "epoch": 0.21078743162447525, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.7949810028076172, + "learning_rate": 7.01992369648156e-07, + "loss": 0.4785, + "mean_token_accuracy": 0.8480629920959473, + "num_tokens": 63340118.0, + "step": 1657 + }, + { + "epoch": 0.21091464190306577, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 2.0596728324890137, + "learning_rate": 7.024162780839339e-07, + "loss": 0.5094, + "mean_token_accuracy": 0.830598771572113, + "num_tokens": 63372037.0, + "step": 1658 + }, + { + "epoch": 0.21104185218165628, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.7291392087936401, + "learning_rate": 7.028401865197116e-07, + "loss": 0.4991, + "mean_token_accuracy": 0.8399325609207153, + "num_tokens": 63415508.0, + "step": 1659 + }, + { + "epoch": 0.21116906246024678, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.6482356786727905, + "learning_rate": 7.032640949554896e-07, + "loss": 0.4109, + "mean_token_accuracy": 0.8635121583938599, + "num_tokens": 63459320.0, + "step": 1660 + }, + { + "epoch": 0.2112962727388373, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.716124176979065, + "learning_rate": 7.036880033912674e-07, + "loss": 0.5081, + "mean_token_accuracy": 0.839228630065918, + "num_tokens": 63499624.0, + "step": 1661 + }, + { + "epoch": 0.2114234830174278, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.7711822986602783, + "learning_rate": 7.041119118270454e-07, + "loss": 0.4633, + "mean_token_accuracy": 0.850297212600708, + "num_tokens": 63536507.0, + "step": 1662 + }, + { + "epoch": 0.2115506932960183, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.905561923980713, + "learning_rate": 7.045358202628232e-07, + "loss": 0.504, + "mean_token_accuracy": 0.841090738773346, + "num_tokens": 63576483.0, + "step": 1663 + }, + { + "epoch": 0.21167790357460883, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.8856091499328613, + "learning_rate": 7.049597286986011e-07, + "loss": 0.4952, + "mean_token_accuracy": 0.8394588232040405, + "num_tokens": 63615568.0, + "step": 1664 + }, + { + "epoch": 0.21180511385319933, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.9110901355743408, + "learning_rate": 7.05383637134379e-07, + "loss": 0.4548, + "mean_token_accuracy": 0.8540444374084473, + "num_tokens": 63650023.0, + "step": 1665 + }, + { + "epoch": 0.21193232413178986, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.788535475730896, + "learning_rate": 7.058075455701568e-07, + "loss": 0.4856, + "mean_token_accuracy": 0.8396928906440735, + "num_tokens": 63690229.0, + "step": 1666 + }, + { + "epoch": 0.21205953441038036, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.8193687200546265, + "learning_rate": 7.062314540059346e-07, + "loss": 0.433, + "mean_token_accuracy": 0.8633370399475098, + "num_tokens": 63728169.0, + "step": 1667 + }, + { + "epoch": 0.21218674468897086, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.9734580516815186, + "learning_rate": 7.066553624417126e-07, + "loss": 0.4553, + "mean_token_accuracy": 0.8515722751617432, + "num_tokens": 63762816.0, + "step": 1668 + }, + { + "epoch": 0.2123139549675614, + "ewc_loss": 3.293156623840332e-06, + "grad_norm": 1.873673677444458, + "learning_rate": 7.070792708774904e-07, + "loss": 0.4773, + "mean_token_accuracy": 0.846702516078949, + "num_tokens": 63804288.0, + "step": 1669 + }, + { + "epoch": 0.2124411652461519, + "ewc_loss": 3.3080577850341797e-06, + "grad_norm": 1.7147479057312012, + "learning_rate": 7.075031793132684e-07, + "loss": 0.4997, + "mean_token_accuracy": 0.8444145917892456, + "num_tokens": 63851365.0, + "step": 1670 + }, + { + "epoch": 0.2125683755247424, + "ewc_loss": 3.3080577850341797e-06, + "grad_norm": 1.8218915462493896, + "learning_rate": 7.079270877490462e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8543976545333862, + "num_tokens": 63886187.0, + "step": 1671 + }, + { + "epoch": 0.21269558580333292, + "ewc_loss": 3.3080577850341797e-06, + "grad_norm": 1.8349180221557617, + "learning_rate": 7.08350996184824e-07, + "loss": 0.5324, + "mean_token_accuracy": 0.8309491872787476, + "num_tokens": 63927187.0, + "step": 1672 + }, + { + "epoch": 0.21282279608192342, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.8212604522705078, + "learning_rate": 7.08774904620602e-07, + "loss": 0.454, + "mean_token_accuracy": 0.8574343919754028, + "num_tokens": 63964088.0, + "step": 1673 + }, + { + "epoch": 0.21295000636051392, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.8703058958053589, + "learning_rate": 7.091988130563797e-07, + "loss": 0.4965, + "mean_token_accuracy": 0.841748058795929, + "num_tokens": 64004931.0, + "step": 1674 + }, + { + "epoch": 0.21307721663910445, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.7366348505020142, + "learning_rate": 7.096227214921576e-07, + "loss": 0.4907, + "mean_token_accuracy": 0.8419787883758545, + "num_tokens": 64043966.0, + "step": 1675 + }, + { + "epoch": 0.21320442691769495, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.8689664602279663, + "learning_rate": 7.100466299279355e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.849830687046051, + "num_tokens": 64078886.0, + "step": 1676 + }, + { + "epoch": 0.21333163719628545, + "ewc_loss": 3.293156623840332e-06, + "grad_norm": 1.8192622661590576, + "learning_rate": 7.104705383637134e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.84687340259552, + "num_tokens": 64117291.0, + "step": 1677 + }, + { + "epoch": 0.21345884747487598, + "ewc_loss": 3.293156623840332e-06, + "grad_norm": 1.8262823820114136, + "learning_rate": 7.108944467994913e-07, + "loss": 0.4574, + "mean_token_accuracy": 0.8534203171730042, + "num_tokens": 64156369.0, + "step": 1678 + }, + { + "epoch": 0.21358605775346648, + "ewc_loss": 3.3080577850341797e-06, + "grad_norm": 1.9633644819259644, + "learning_rate": 7.113183552352692e-07, + "loss": 0.5291, + "mean_token_accuracy": 0.8279298543930054, + "num_tokens": 64193646.0, + "step": 1679 + }, + { + "epoch": 0.21371326803205698, + "ewc_loss": 3.3080577850341797e-06, + "grad_norm": 1.8293508291244507, + "learning_rate": 7.11742263671047e-07, + "loss": 0.4222, + "mean_token_accuracy": 0.8630033731460571, + "num_tokens": 64231933.0, + "step": 1680 + }, + { + "epoch": 0.2138404783106475, + "ewc_loss": 3.3080577850341797e-06, + "grad_norm": 2.0364859104156494, + "learning_rate": 7.12166172106825e-07, + "loss": 0.5521, + "mean_token_accuracy": 0.8233211040496826, + "num_tokens": 64267418.0, + "step": 1681 + }, + { + "epoch": 0.213967688589238, + "ewc_loss": 3.3080577850341797e-06, + "grad_norm": 1.8381887674331665, + "learning_rate": 7.125900805426027e-07, + "loss": 0.4351, + "mean_token_accuracy": 0.8592895269393921, + "num_tokens": 64302440.0, + "step": 1682 + }, + { + "epoch": 0.2140948988678285, + "ewc_loss": 3.3080577850341797e-06, + "grad_norm": 1.8134639263153076, + "learning_rate": 7.130139889783806e-07, + "loss": 0.4721, + "mean_token_accuracy": 0.8494440913200378, + "num_tokens": 64338240.0, + "step": 1683 + }, + { + "epoch": 0.21422210914641904, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.9452098608016968, + "learning_rate": 7.134378974141585e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8492567539215088, + "num_tokens": 64375734.0, + "step": 1684 + }, + { + "epoch": 0.21434931942500954, + "ewc_loss": 3.3229589462280273e-06, + "grad_norm": 1.995247483253479, + "learning_rate": 7.138618058499364e-07, + "loss": 0.4583, + "mean_token_accuracy": 0.8532652854919434, + "num_tokens": 64411401.0, + "step": 1685 + }, + { + "epoch": 0.21447652970360004, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.728763461112976, + "learning_rate": 7.142857142857143e-07, + "loss": 0.4336, + "mean_token_accuracy": 0.8611012697219849, + "num_tokens": 64454719.0, + "step": 1686 + }, + { + "epoch": 0.21460373998219057, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.8280634880065918, + "learning_rate": 7.147096227214922e-07, + "loss": 0.4592, + "mean_token_accuracy": 0.8541502952575684, + "num_tokens": 64495736.0, + "step": 1687 + }, + { + "epoch": 0.21473095026078107, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.7340351343154907, + "learning_rate": 7.1513353115727e-07, + "loss": 0.4382, + "mean_token_accuracy": 0.8576394319534302, + "num_tokens": 64532843.0, + "step": 1688 + }, + { + "epoch": 0.21485816053937157, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.9304816722869873, + "learning_rate": 7.155574395930479e-07, + "loss": 0.4832, + "mean_token_accuracy": 0.8422399759292603, + "num_tokens": 64568950.0, + "step": 1689 + }, + { + "epoch": 0.2149853708179621, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.8673111200332642, + "learning_rate": 7.159813480288257e-07, + "loss": 0.4611, + "mean_token_accuracy": 0.8490656614303589, + "num_tokens": 64606878.0, + "step": 1690 + }, + { + "epoch": 0.2151125810965526, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.8560279607772827, + "learning_rate": 7.164052564646035e-07, + "loss": 0.4305, + "mean_token_accuracy": 0.8605040311813354, + "num_tokens": 64643012.0, + "step": 1691 + }, + { + "epoch": 0.21523979137514312, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.8676260709762573, + "learning_rate": 7.168291649003815e-07, + "loss": 0.4527, + "mean_token_accuracy": 0.8578610420227051, + "num_tokens": 64684444.0, + "step": 1692 + }, + { + "epoch": 0.21536700165373363, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 2.0466387271881104, + "learning_rate": 7.172530733361593e-07, + "loss": 0.4992, + "mean_token_accuracy": 0.8435064554214478, + "num_tokens": 64722068.0, + "step": 1693 + }, + { + "epoch": 0.21549421193232413, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 5.066115856170654, + "learning_rate": 7.176769817719373e-07, + "loss": 0.411, + "mean_token_accuracy": 0.8723517060279846, + "num_tokens": 64761829.0, + "step": 1694 + }, + { + "epoch": 0.21562142221091465, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.834197998046875, + "learning_rate": 7.181008902077151e-07, + "loss": 0.4603, + "mean_token_accuracy": 0.8529433012008667, + "num_tokens": 64799185.0, + "step": 1695 + }, + { + "epoch": 0.21574863248950515, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.970847487449646, + "learning_rate": 7.18524798643493e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.8460603356361389, + "num_tokens": 64839306.0, + "step": 1696 + }, + { + "epoch": 0.21587584276809565, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.8156346082687378, + "learning_rate": 7.189487070792708e-07, + "loss": 0.4642, + "mean_token_accuracy": 0.8536819219589233, + "num_tokens": 64876011.0, + "step": 1697 + }, + { + "epoch": 0.21600305304668618, + "ewc_loss": 3.3229589462280273e-06, + "grad_norm": 1.7310539484024048, + "learning_rate": 7.193726155150487e-07, + "loss": 0.4624, + "mean_token_accuracy": 0.8540621399879456, + "num_tokens": 64918510.0, + "step": 1698 + }, + { + "epoch": 0.21613026332527668, + "ewc_loss": 3.3229589462280273e-06, + "grad_norm": 1.7175456285476685, + "learning_rate": 7.197965239508265e-07, + "loss": 0.4682, + "mean_token_accuracy": 0.8503475785255432, + "num_tokens": 64957156.0, + "step": 1699 + }, + { + "epoch": 0.21625747360386718, + "ewc_loss": 3.3229589462280273e-06, + "grad_norm": 1.8622872829437256, + "learning_rate": 7.202204323866045e-07, + "loss": 0.4659, + "mean_token_accuracy": 0.8524333238601685, + "num_tokens": 64998761.0, + "step": 1700 + }, + { + "epoch": 0.2163846838824577, + "ewc_loss": 3.3229589462280273e-06, + "grad_norm": 2.0444064140319824, + "learning_rate": 7.206443408223823e-07, + "loss": 0.47, + "mean_token_accuracy": 0.8487025499343872, + "num_tokens": 65038849.0, + "step": 1701 + }, + { + "epoch": 0.2165118941610482, + "ewc_loss": 3.3229589462280273e-06, + "grad_norm": 1.9218305349349976, + "learning_rate": 7.210682492581603e-07, + "loss": 0.4482, + "mean_token_accuracy": 0.8532785773277283, + "num_tokens": 65071130.0, + "step": 1702 + }, + { + "epoch": 0.2166391044396387, + "ewc_loss": 3.3229589462280273e-06, + "grad_norm": 1.8211671113967896, + "learning_rate": 7.214921576939381e-07, + "loss": 0.4901, + "mean_token_accuracy": 0.8396577835083008, + "num_tokens": 65111619.0, + "step": 1703 + }, + { + "epoch": 0.21676631471822924, + "ewc_loss": 3.3229589462280273e-06, + "grad_norm": 2.055861711502075, + "learning_rate": 7.219160661297159e-07, + "loss": 0.4384, + "mean_token_accuracy": 0.8576036095619202, + "num_tokens": 65147537.0, + "step": 1704 + }, + { + "epoch": 0.21689352499681974, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.853085994720459, + "learning_rate": 7.223399745654938e-07, + "loss": 0.4768, + "mean_token_accuracy": 0.847900927066803, + "num_tokens": 65187254.0, + "step": 1705 + }, + { + "epoch": 0.21702073527541024, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.8760923147201538, + "learning_rate": 7.227638830012717e-07, + "loss": 0.4878, + "mean_token_accuracy": 0.8413019776344299, + "num_tokens": 65226130.0, + "step": 1706 + }, + { + "epoch": 0.21714794555400077, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 2.048208713531494, + "learning_rate": 7.231877914370495e-07, + "loss": 0.4499, + "mean_token_accuracy": 0.8574471473693848, + "num_tokens": 65261282.0, + "step": 1707 + }, + { + "epoch": 0.21727515583259127, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.9757150411605835, + "learning_rate": 7.236116998728275e-07, + "loss": 0.5017, + "mean_token_accuracy": 0.838979959487915, + "num_tokens": 65294850.0, + "step": 1708 + }, + { + "epoch": 0.21740236611118177, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.993263602256775, + "learning_rate": 7.240356083086053e-07, + "loss": 0.3956, + "mean_token_accuracy": 0.8713945150375366, + "num_tokens": 65332323.0, + "step": 1709 + }, + { + "epoch": 0.2175295763897723, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.884325385093689, + "learning_rate": 7.244595167443833e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.8457497358322144, + "num_tokens": 65376033.0, + "step": 1710 + }, + { + "epoch": 0.2176567866683628, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.7663880586624146, + "learning_rate": 7.248834251801611e-07, + "loss": 0.4545, + "mean_token_accuracy": 0.8562524318695068, + "num_tokens": 65417875.0, + "step": 1711 + }, + { + "epoch": 0.2177839969469533, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.8756648302078247, + "learning_rate": 7.253073336159388e-07, + "loss": 0.4315, + "mean_token_accuracy": 0.8610281944274902, + "num_tokens": 65456675.0, + "step": 1712 + }, + { + "epoch": 0.21791120722554383, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.9525401592254639, + "learning_rate": 7.257312420517168e-07, + "loss": 0.4221, + "mean_token_accuracy": 0.8627598881721497, + "num_tokens": 65487699.0, + "step": 1713 + }, + { + "epoch": 0.21803841750413433, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.8174878358840942, + "learning_rate": 7.261551504874946e-07, + "loss": 0.4502, + "mean_token_accuracy": 0.8504235744476318, + "num_tokens": 65526143.0, + "step": 1714 + }, + { + "epoch": 0.21816562778272486, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.7528798580169678, + "learning_rate": 7.265790589232725e-07, + "loss": 0.4473, + "mean_token_accuracy": 0.8578867316246033, + "num_tokens": 65562969.0, + "step": 1715 + }, + { + "epoch": 0.21829283806131536, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.701521396636963, + "learning_rate": 7.270029673590504e-07, + "loss": 0.4796, + "mean_token_accuracy": 0.8478831052780151, + "num_tokens": 65605999.0, + "step": 1716 + }, + { + "epoch": 0.21842004833990586, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.8689463138580322, + "learning_rate": 7.274268757948283e-07, + "loss": 0.5239, + "mean_token_accuracy": 0.8346320390701294, + "num_tokens": 65644877.0, + "step": 1717 + }, + { + "epoch": 0.2185472586184964, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.8606077432632446, + "learning_rate": 7.278507842306062e-07, + "loss": 0.4456, + "mean_token_accuracy": 0.8574233055114746, + "num_tokens": 65678823.0, + "step": 1718 + }, + { + "epoch": 0.2186744688970869, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.9723690748214722, + "learning_rate": 7.282746926663841e-07, + "loss": 0.5243, + "mean_token_accuracy": 0.8334839344024658, + "num_tokens": 65717185.0, + "step": 1719 + }, + { + "epoch": 0.2188016791756774, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.862960934638977, + "learning_rate": 7.286986011021618e-07, + "loss": 0.4442, + "mean_token_accuracy": 0.8569629788398743, + "num_tokens": 65754422.0, + "step": 1720 + }, + { + "epoch": 0.21892888945426792, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.9676390886306763, + "learning_rate": 7.291225095379398e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.8484231233596802, + "num_tokens": 65799390.0, + "step": 1721 + }, + { + "epoch": 0.21905609973285842, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 2.052745819091797, + "learning_rate": 7.295464179737176e-07, + "loss": 0.4703, + "mean_token_accuracy": 0.8513375520706177, + "num_tokens": 65837566.0, + "step": 1722 + }, + { + "epoch": 0.21918331001144892, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 1.8598593473434448, + "learning_rate": 7.299703264094955e-07, + "loss": 0.4588, + "mean_token_accuracy": 0.8527386784553528, + "num_tokens": 65874537.0, + "step": 1723 + }, + { + "epoch": 0.21931052029003945, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 1.8110191822052002, + "learning_rate": 7.303942348452734e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8448469638824463, + "num_tokens": 65915616.0, + "step": 1724 + }, + { + "epoch": 0.21943773056862995, + "ewc_loss": 3.382563591003418e-06, + "grad_norm": 1.7244560718536377, + "learning_rate": 7.308181432810513e-07, + "loss": 0.4335, + "mean_token_accuracy": 0.8585513830184937, + "num_tokens": 65954263.0, + "step": 1725 + }, + { + "epoch": 0.21956494084722045, + "ewc_loss": 3.382563591003418e-06, + "grad_norm": 1.7643834352493286, + "learning_rate": 7.312420517168292e-07, + "loss": 0.4665, + "mean_token_accuracy": 0.8487060070037842, + "num_tokens": 65992880.0, + "step": 1726 + }, + { + "epoch": 0.21969215112581097, + "ewc_loss": 3.382563591003418e-06, + "grad_norm": 1.9342056512832642, + "learning_rate": 7.31665960152607e-07, + "loss": 0.4847, + "mean_token_accuracy": 0.8426699042320251, + "num_tokens": 66033958.0, + "step": 1727 + }, + { + "epoch": 0.21981936140440148, + "ewc_loss": 3.382563591003418e-06, + "grad_norm": 1.913404107093811, + "learning_rate": 7.320898685883848e-07, + "loss": 0.4556, + "mean_token_accuracy": 0.8522659540176392, + "num_tokens": 66072448.0, + "step": 1728 + }, + { + "epoch": 0.21994657168299198, + "ewc_loss": 3.382563591003418e-06, + "grad_norm": 1.836801290512085, + "learning_rate": 7.325137770241628e-07, + "loss": 0.5663, + "mean_token_accuracy": 0.8223171234130859, + "num_tokens": 66112648.0, + "step": 1729 + }, + { + "epoch": 0.2200737819615825, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.6869773864746094, + "learning_rate": 7.329376854599406e-07, + "loss": 0.4781, + "mean_token_accuracy": 0.8484206795692444, + "num_tokens": 66154791.0, + "step": 1730 + }, + { + "epoch": 0.220200992240173, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.8821121454238892, + "learning_rate": 7.333615938957184e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8500759601593018, + "num_tokens": 66192450.0, + "step": 1731 + }, + { + "epoch": 0.2203282025187635, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.7746953964233398, + "learning_rate": 7.337855023314964e-07, + "loss": 0.4602, + "mean_token_accuracy": 0.8515914678573608, + "num_tokens": 66231173.0, + "step": 1732 + }, + { + "epoch": 0.22045541279735403, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.8187806606292725, + "learning_rate": 7.342094107672742e-07, + "loss": 0.5143, + "mean_token_accuracy": 0.8352612257003784, + "num_tokens": 66272956.0, + "step": 1733 + }, + { + "epoch": 0.22058262307594453, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.8509587049484253, + "learning_rate": 7.346333192030522e-07, + "loss": 0.4669, + "mean_token_accuracy": 0.85486900806427, + "num_tokens": 66309450.0, + "step": 1734 + }, + { + "epoch": 0.22070983335453503, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 2.1162612438201904, + "learning_rate": 7.350572276388299e-07, + "loss": 0.518, + "mean_token_accuracy": 0.8350151181221008, + "num_tokens": 66350947.0, + "step": 1735 + }, + { + "epoch": 0.22083704363312556, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 2.2833704948425293, + "learning_rate": 7.354811360746078e-07, + "loss": 0.4756, + "mean_token_accuracy": 0.8465526103973389, + "num_tokens": 66388364.0, + "step": 1736 + }, + { + "epoch": 0.22096425391171606, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 2.396871566772461, + "learning_rate": 7.359050445103857e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.8424524068832397, + "num_tokens": 66425029.0, + "step": 1737 + }, + { + "epoch": 0.22109146419030656, + "ewc_loss": 3.4123659133911133e-06, + "grad_norm": 1.9183108806610107, + "learning_rate": 7.363289529461636e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.852096438407898, + "num_tokens": 66463924.0, + "step": 1738 + }, + { + "epoch": 0.2212186744688971, + "ewc_loss": 3.4123659133911133e-06, + "grad_norm": 1.836287498474121, + "learning_rate": 7.367528613819415e-07, + "loss": 0.4214, + "mean_token_accuracy": 0.8639223575592041, + "num_tokens": 66504824.0, + "step": 1739 + }, + { + "epoch": 0.2213458847474876, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.8029811382293701, + "learning_rate": 7.371767698177194e-07, + "loss": 0.4406, + "mean_token_accuracy": 0.8567449450492859, + "num_tokens": 66539308.0, + "step": 1740 + }, + { + "epoch": 0.22147309502607812, + "ewc_loss": 3.4123659133911133e-06, + "grad_norm": 1.8273452520370483, + "learning_rate": 7.376006782534972e-07, + "loss": 0.435, + "mean_token_accuracy": 0.8608046770095825, + "num_tokens": 66572785.0, + "step": 1741 + }, + { + "epoch": 0.22160030530466862, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 2.027631998062134, + "learning_rate": 7.380245866892751e-07, + "loss": 0.454, + "mean_token_accuracy": 0.8544260859489441, + "num_tokens": 66607890.0, + "step": 1742 + }, + { + "epoch": 0.22172751558325912, + "ewc_loss": 3.4123659133911133e-06, + "grad_norm": 1.7927460670471191, + "learning_rate": 7.384484951250529e-07, + "loss": 0.4551, + "mean_token_accuracy": 0.8563891649246216, + "num_tokens": 66650553.0, + "step": 1743 + }, + { + "epoch": 0.22185472586184965, + "ewc_loss": 3.4123659133911133e-06, + "grad_norm": 1.8192455768585205, + "learning_rate": 7.388724035608308e-07, + "loss": 0.4517, + "mean_token_accuracy": 0.8572368621826172, + "num_tokens": 66685330.0, + "step": 1744 + }, + { + "epoch": 0.22198193614044015, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.9043691158294678, + "learning_rate": 7.392963119966087e-07, + "loss": 0.4348, + "mean_token_accuracy": 0.8585048317909241, + "num_tokens": 66716816.0, + "step": 1745 + }, + { + "epoch": 0.22210914641903065, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.901429295539856, + "learning_rate": 7.397202204323866e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.8478533029556274, + "num_tokens": 66754030.0, + "step": 1746 + }, + { + "epoch": 0.22223635669762118, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.925015926361084, + "learning_rate": 7.401441288681645e-07, + "loss": 0.4368, + "mean_token_accuracy": 0.8591593503952026, + "num_tokens": 66793650.0, + "step": 1747 + }, + { + "epoch": 0.22236356697621168, + "ewc_loss": 3.427267074584961e-06, + "grad_norm": 1.7844481468200684, + "learning_rate": 7.405680373039424e-07, + "loss": 0.4225, + "mean_token_accuracy": 0.8672291040420532, + "num_tokens": 66833709.0, + "step": 1748 + }, + { + "epoch": 0.22249077725480218, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.9921927452087402, + "learning_rate": 7.409919457397202e-07, + "loss": 0.4867, + "mean_token_accuracy": 0.8427201509475708, + "num_tokens": 66866456.0, + "step": 1749 + }, + { + "epoch": 0.2226179875333927, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.8141275644302368, + "learning_rate": 7.414158541754981e-07, + "loss": 0.5156, + "mean_token_accuracy": 0.8334271311759949, + "num_tokens": 66905875.0, + "step": 1750 + }, + { + "epoch": 0.2227451978119832, + "ewc_loss": 3.4123659133911133e-06, + "grad_norm": 2.455092668533325, + "learning_rate": 7.418397626112759e-07, + "loss": 0.4692, + "mean_token_accuracy": 0.857210636138916, + "num_tokens": 66936009.0, + "step": 1751 + }, + { + "epoch": 0.2228724080905737, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.832142949104309, + "learning_rate": 7.422636710470537e-07, + "loss": 0.4721, + "mean_token_accuracy": 0.8496477603912354, + "num_tokens": 66978111.0, + "step": 1752 + }, + { + "epoch": 0.22299961836916424, + "ewc_loss": 3.427267074584961e-06, + "grad_norm": 1.8584948778152466, + "learning_rate": 7.426875794828317e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.848598837852478, + "num_tokens": 67015938.0, + "step": 1753 + }, + { + "epoch": 0.22312682864775474, + "ewc_loss": 3.427267074584961e-06, + "grad_norm": 1.8676912784576416, + "learning_rate": 7.431114879186095e-07, + "loss": 0.5074, + "mean_token_accuracy": 0.8397715091705322, + "num_tokens": 67055456.0, + "step": 1754 + }, + { + "epoch": 0.22325403892634524, + "ewc_loss": 3.427267074584961e-06, + "grad_norm": 1.9066108465194702, + "learning_rate": 7.435353963543875e-07, + "loss": 0.4657, + "mean_token_accuracy": 0.8511576652526855, + "num_tokens": 67096298.0, + "step": 1755 + }, + { + "epoch": 0.22338124920493577, + "ewc_loss": 3.427267074584961e-06, + "grad_norm": 1.8453028202056885, + "learning_rate": 7.439593047901653e-07, + "loss": 0.4232, + "mean_token_accuracy": 0.8641700744628906, + "num_tokens": 67135280.0, + "step": 1756 + }, + { + "epoch": 0.22350845948352627, + "ewc_loss": 3.427267074584961e-06, + "grad_norm": 1.918244481086731, + "learning_rate": 7.443832132259431e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.8423277139663696, + "num_tokens": 67171936.0, + "step": 1757 + }, + { + "epoch": 0.22363566976211677, + "ewc_loss": 3.427267074584961e-06, + "grad_norm": 1.8447675704956055, + "learning_rate": 7.44807121661721e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.8482155203819275, + "num_tokens": 67209854.0, + "step": 1758 + }, + { + "epoch": 0.2237628800407073, + "ewc_loss": 3.427267074584961e-06, + "grad_norm": 2.047001600265503, + "learning_rate": 7.452310300974989e-07, + "loss": 0.4289, + "mean_token_accuracy": 0.8571270108222961, + "num_tokens": 67244369.0, + "step": 1759 + }, + { + "epoch": 0.2238900903192978, + "ewc_loss": 3.427267074584961e-06, + "grad_norm": 1.8010523319244385, + "learning_rate": 7.456549385332767e-07, + "loss": 0.5068, + "mean_token_accuracy": 0.8394152522087097, + "num_tokens": 67283994.0, + "step": 1760 + }, + { + "epoch": 0.2240173005978883, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.7354958057403564, + "learning_rate": 7.460788469690547e-07, + "loss": 0.3924, + "mean_token_accuracy": 0.8698146939277649, + "num_tokens": 67323230.0, + "step": 1761 + }, + { + "epoch": 0.22414451087647883, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.924368143081665, + "learning_rate": 7.465027554048325e-07, + "loss": 0.5435, + "mean_token_accuracy": 0.8294099569320679, + "num_tokens": 67366198.0, + "step": 1762 + }, + { + "epoch": 0.22427172115506933, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.737062692642212, + "learning_rate": 7.469266638406105e-07, + "loss": 0.5279, + "mean_token_accuracy": 0.8343662023544312, + "num_tokens": 67410685.0, + "step": 1763 + }, + { + "epoch": 0.22439893143365983, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.8841358423233032, + "learning_rate": 7.473505722763883e-07, + "loss": 0.4512, + "mean_token_accuracy": 0.855843722820282, + "num_tokens": 67445818.0, + "step": 1764 + }, + { + "epoch": 0.22452614171225035, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.8828859329223633, + "learning_rate": 7.477744807121661e-07, + "loss": 0.4939, + "mean_token_accuracy": 0.842321515083313, + "num_tokens": 67483873.0, + "step": 1765 + }, + { + "epoch": 0.22465335199084085, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 2.1022627353668213, + "learning_rate": 7.48198389147944e-07, + "loss": 0.4714, + "mean_token_accuracy": 0.8521501421928406, + "num_tokens": 67519524.0, + "step": 1766 + }, + { + "epoch": 0.22478056226943138, + "ewc_loss": 3.471970558166504e-06, + "grad_norm": 2.0435428619384766, + "learning_rate": 7.486222975837219e-07, + "loss": 0.456, + "mean_token_accuracy": 0.8509755730628967, + "num_tokens": 67560920.0, + "step": 1767 + }, + { + "epoch": 0.22490777254802188, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.9241483211517334, + "learning_rate": 7.490462060194997e-07, + "loss": 0.4275, + "mean_token_accuracy": 0.8582442998886108, + "num_tokens": 67594948.0, + "step": 1768 + }, + { + "epoch": 0.22503498282661238, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.9360847473144531, + "learning_rate": 7.494701144552777e-07, + "loss": 0.5122, + "mean_token_accuracy": 0.836521327495575, + "num_tokens": 67628867.0, + "step": 1769 + }, + { + "epoch": 0.2251621931052029, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.9085861444473267, + "learning_rate": 7.498940228910555e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.847943902015686, + "num_tokens": 67669762.0, + "step": 1770 + }, + { + "epoch": 0.2252894033837934, + "ewc_loss": 3.471970558166504e-06, + "grad_norm": 2.1459720134735107, + "learning_rate": 7.503179313268335e-07, + "loss": 0.4619, + "mean_token_accuracy": 0.8488452434539795, + "num_tokens": 67704750.0, + "step": 1771 + }, + { + "epoch": 0.2254166136623839, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.770384669303894, + "learning_rate": 7.507418397626113e-07, + "loss": 0.4976, + "mean_token_accuracy": 0.8402584791183472, + "num_tokens": 67745181.0, + "step": 1772 + }, + { + "epoch": 0.22554382394097444, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.805572748184204, + "learning_rate": 7.51165748198389e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.8429880142211914, + "num_tokens": 67785929.0, + "step": 1773 + }, + { + "epoch": 0.22567103421956494, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.817356824874878, + "learning_rate": 7.51589656634167e-07, + "loss": 0.4435, + "mean_token_accuracy": 0.853261411190033, + "num_tokens": 67821887.0, + "step": 1774 + }, + { + "epoch": 0.22579824449815544, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.8068976402282715, + "learning_rate": 7.520135650699448e-07, + "loss": 0.4781, + "mean_token_accuracy": 0.8461359739303589, + "num_tokens": 67859608.0, + "step": 1775 + }, + { + "epoch": 0.22592545477674597, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.8672726154327393, + "learning_rate": 7.524374735057227e-07, + "loss": 0.4845, + "mean_token_accuracy": 0.8455032706260681, + "num_tokens": 67895287.0, + "step": 1776 + }, + { + "epoch": 0.22605266505533647, + "ewc_loss": 3.4868717193603516e-06, + "grad_norm": 1.8655760288238525, + "learning_rate": 7.528613819415006e-07, + "loss": 0.4468, + "mean_token_accuracy": 0.8608630299568176, + "num_tokens": 67930358.0, + "step": 1777 + }, + { + "epoch": 0.22617987533392697, + "ewc_loss": 3.4868717193603516e-06, + "grad_norm": 1.8831740617752075, + "learning_rate": 7.532852903772785e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8435105681419373, + "num_tokens": 67966112.0, + "step": 1778 + }, + { + "epoch": 0.2263070856125175, + "ewc_loss": 3.5017728805541992e-06, + "grad_norm": 1.9309638738632202, + "learning_rate": 7.537091988130564e-07, + "loss": 0.4917, + "mean_token_accuracy": 0.842293381690979, + "num_tokens": 68001297.0, + "step": 1779 + }, + { + "epoch": 0.226434295891108, + "ewc_loss": 3.5017728805541992e-06, + "grad_norm": 1.9007993936538696, + "learning_rate": 7.541331072488342e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.8484752178192139, + "num_tokens": 68037346.0, + "step": 1780 + }, + { + "epoch": 0.2265615061696985, + "ewc_loss": 3.5017728805541992e-06, + "grad_norm": 1.970462441444397, + "learning_rate": 7.54557015684612e-07, + "loss": 0.473, + "mean_token_accuracy": 0.8467981815338135, + "num_tokens": 68076566.0, + "step": 1781 + }, + { + "epoch": 0.22668871644828903, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.8124178647994995, + "learning_rate": 7.5498092412039e-07, + "loss": 0.4524, + "mean_token_accuracy": 0.8565570712089539, + "num_tokens": 68116834.0, + "step": 1782 + }, + { + "epoch": 0.22681592672687953, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 2.274836778640747, + "learning_rate": 7.554048325561678e-07, + "loss": 0.4235, + "mean_token_accuracy": 0.8649304509162903, + "num_tokens": 68155573.0, + "step": 1783 + }, + { + "epoch": 0.22694313700547003, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 2.023172378540039, + "learning_rate": 7.558287409919457e-07, + "loss": 0.5162, + "mean_token_accuracy": 0.8382802605628967, + "num_tokens": 68193378.0, + "step": 1784 + }, + { + "epoch": 0.22707034728406056, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.9328371286392212, + "learning_rate": 7.562526494277236e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.8390576839447021, + "num_tokens": 68230183.0, + "step": 1785 + }, + { + "epoch": 0.22719755756265106, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.736036777496338, + "learning_rate": 7.566765578635015e-07, + "loss": 0.4603, + "mean_token_accuracy": 0.8550195097923279, + "num_tokens": 68270526.0, + "step": 1786 + }, + { + "epoch": 0.22732476784124156, + "ewc_loss": 3.546476364135742e-06, + "grad_norm": 1.8003498315811157, + "learning_rate": 7.571004662992794e-07, + "loss": 0.4878, + "mean_token_accuracy": 0.84804368019104, + "num_tokens": 68308690.0, + "step": 1787 + }, + { + "epoch": 0.2274519781198321, + "ewc_loss": 3.5017728805541992e-06, + "grad_norm": 1.8354840278625488, + "learning_rate": 7.575243747350572e-07, + "loss": 0.5289, + "mean_token_accuracy": 0.834557056427002, + "num_tokens": 68348660.0, + "step": 1788 + }, + { + "epoch": 0.2275791883984226, + "ewc_loss": 3.5017728805541992e-06, + "grad_norm": 1.8004353046417236, + "learning_rate": 7.57948283170835e-07, + "loss": 0.4774, + "mean_token_accuracy": 0.8510211706161499, + "num_tokens": 68389129.0, + "step": 1789 + }, + { + "epoch": 0.2277063986770131, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 2.0460147857666016, + "learning_rate": 7.58372191606613e-07, + "loss": 0.4705, + "mean_token_accuracy": 0.850678026676178, + "num_tokens": 68423234.0, + "step": 1790 + }, + { + "epoch": 0.22783360895560362, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.9872314929962158, + "learning_rate": 7.587961000423908e-07, + "loss": 0.478, + "mean_token_accuracy": 0.8490136861801147, + "num_tokens": 68460834.0, + "step": 1791 + }, + { + "epoch": 0.22796081923419412, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.993038535118103, + "learning_rate": 7.592200084781686e-07, + "loss": 0.5223, + "mean_token_accuracy": 0.8384672999382019, + "num_tokens": 68495605.0, + "step": 1792 + }, + { + "epoch": 0.22808802951278465, + "ewc_loss": 3.546476364135742e-06, + "grad_norm": 1.8838121891021729, + "learning_rate": 7.596439169139466e-07, + "loss": 0.4694, + "mean_token_accuracy": 0.8492722511291504, + "num_tokens": 68530838.0, + "step": 1793 + }, + { + "epoch": 0.22821523979137515, + "ewc_loss": 3.546476364135742e-06, + "grad_norm": 1.8352264165878296, + "learning_rate": 7.600678253497244e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.8547717928886414, + "num_tokens": 68571225.0, + "step": 1794 + }, + { + "epoch": 0.22834245006996565, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.8397246599197388, + "learning_rate": 7.604917337855023e-07, + "loss": 0.4428, + "mean_token_accuracy": 0.859397292137146, + "num_tokens": 68608284.0, + "step": 1795 + }, + { + "epoch": 0.22846966034855618, + "ewc_loss": 3.546476364135742e-06, + "grad_norm": 1.8725169897079468, + "learning_rate": 7.609156422212801e-07, + "loss": 0.4258, + "mean_token_accuracy": 0.8624019622802734, + "num_tokens": 68645943.0, + "step": 1796 + }, + { + "epoch": 0.22859687062714668, + "ewc_loss": 3.516674041748047e-06, + "grad_norm": 1.7458913326263428, + "learning_rate": 7.61339550657058e-07, + "loss": 0.4482, + "mean_token_accuracy": 0.8560603260993958, + "num_tokens": 68684245.0, + "step": 1797 + }, + { + "epoch": 0.22872408090573718, + "ewc_loss": 3.516674041748047e-06, + "grad_norm": 2.0106942653656006, + "learning_rate": 7.617634590928359e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.8545128107070923, + "num_tokens": 68717532.0, + "step": 1798 + }, + { + "epoch": 0.2288512911843277, + "ewc_loss": 3.516674041748047e-06, + "grad_norm": 1.8284661769866943, + "learning_rate": 7.621873675286138e-07, + "loss": 0.4808, + "mean_token_accuracy": 0.8483206033706665, + "num_tokens": 68759515.0, + "step": 1799 + }, + { + "epoch": 0.2289785014629182, + "ewc_loss": 3.516674041748047e-06, + "grad_norm": 1.8389809131622314, + "learning_rate": 7.626112759643916e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8443520665168762, + "num_tokens": 68796969.0, + "step": 1800 + }, + { + "epoch": 0.2291057117415087, + "ewc_loss": 3.516674041748047e-06, + "grad_norm": 1.9707012176513672, + "learning_rate": 7.630351844001696e-07, + "loss": 0.4694, + "mean_token_accuracy": 0.849528968334198, + "num_tokens": 68831138.0, + "step": 1801 + }, + { + "epoch": 0.22923292202009923, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.8185343742370605, + "learning_rate": 7.634590928359474e-07, + "loss": 0.4548, + "mean_token_accuracy": 0.8553991317749023, + "num_tokens": 68868741.0, + "step": 1802 + }, + { + "epoch": 0.22936013229868973, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.8018218278884888, + "learning_rate": 7.638830012717253e-07, + "loss": 0.4807, + "mean_token_accuracy": 0.8446770906448364, + "num_tokens": 68909191.0, + "step": 1803 + }, + { + "epoch": 0.22948734257728023, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.7932829856872559, + "learning_rate": 7.643069097075031e-07, + "loss": 0.4552, + "mean_token_accuracy": 0.852932333946228, + "num_tokens": 68949261.0, + "step": 1804 + }, + { + "epoch": 0.22961455285587076, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 2.3623838424682617, + "learning_rate": 7.64730818143281e-07, + "loss": 0.4288, + "mean_token_accuracy": 0.8581088781356812, + "num_tokens": 68990434.0, + "step": 1805 + }, + { + "epoch": 0.22974176313446126, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.8756366968154907, + "learning_rate": 7.651547265790589e-07, + "loss": 0.5103, + "mean_token_accuracy": 0.8390411734580994, + "num_tokens": 69029482.0, + "step": 1806 + }, + { + "epoch": 0.22986897341305176, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.8885326385498047, + "learning_rate": 7.655786350148368e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.843044102191925, + "num_tokens": 69069766.0, + "step": 1807 + }, + { + "epoch": 0.2299961836916423, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.7507121562957764, + "learning_rate": 7.660025434506146e-07, + "loss": 0.477, + "mean_token_accuracy": 0.8469835519790649, + "num_tokens": 69114592.0, + "step": 1808 + }, + { + "epoch": 0.2301233939702328, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.829047441482544, + "learning_rate": 7.664264518863926e-07, + "loss": 0.4664, + "mean_token_accuracy": 0.8519379496574402, + "num_tokens": 69152599.0, + "step": 1809 + }, + { + "epoch": 0.2302506042488233, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.9490776062011719, + "learning_rate": 7.668503603221704e-07, + "loss": 0.5198, + "mean_token_accuracy": 0.8320410251617432, + "num_tokens": 69193157.0, + "step": 1810 + }, + { + "epoch": 0.23037781452741382, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.8779253959655762, + "learning_rate": 7.672742687579483e-07, + "loss": 0.4295, + "mean_token_accuracy": 0.8597649335861206, + "num_tokens": 69230681.0, + "step": 1811 + }, + { + "epoch": 0.23050502480600432, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.8769607543945312, + "learning_rate": 7.676981771937261e-07, + "loss": 0.4103, + "mean_token_accuracy": 0.8640447854995728, + "num_tokens": 69261292.0, + "step": 1812 + }, + { + "epoch": 0.23063223508459482, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.7613060474395752, + "learning_rate": 7.681220856295039e-07, + "loss": 0.4396, + "mean_token_accuracy": 0.858905553817749, + "num_tokens": 69301795.0, + "step": 1813 + }, + { + "epoch": 0.23075944536318535, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.9676536321640015, + "learning_rate": 7.685459940652819e-07, + "loss": 0.464, + "mean_token_accuracy": 0.8473803400993347, + "num_tokens": 69337038.0, + "step": 1814 + }, + { + "epoch": 0.23088665564177585, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.7218036651611328, + "learning_rate": 7.689699025010597e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8512433767318726, + "num_tokens": 69379871.0, + "step": 1815 + }, + { + "epoch": 0.23101386592036638, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.8242108821868896, + "learning_rate": 7.693938109368376e-07, + "loss": 0.4799, + "mean_token_accuracy": 0.8446401357650757, + "num_tokens": 69418021.0, + "step": 1816 + }, + { + "epoch": 0.23114107619895688, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.8657318353652954, + "learning_rate": 7.698177193726155e-07, + "loss": 0.462, + "mean_token_accuracy": 0.8501391410827637, + "num_tokens": 69455574.0, + "step": 1817 + }, + { + "epoch": 0.23126828647754738, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.9150232076644897, + "learning_rate": 7.702416278083933e-07, + "loss": 0.4853, + "mean_token_accuracy": 0.8458940386772156, + "num_tokens": 69489822.0, + "step": 1818 + }, + { + "epoch": 0.2313954967561379, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.7397228479385376, + "learning_rate": 7.706655362441712e-07, + "loss": 0.4548, + "mean_token_accuracy": 0.8525846600532532, + "num_tokens": 69532821.0, + "step": 1819 + }, + { + "epoch": 0.2315227070347284, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.9231088161468506, + "learning_rate": 7.710894446799491e-07, + "loss": 0.4388, + "mean_token_accuracy": 0.8566944003105164, + "num_tokens": 69565674.0, + "step": 1820 + }, + { + "epoch": 0.2316499173133189, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.8789407014846802, + "learning_rate": 7.715133531157269e-07, + "loss": 0.4412, + "mean_token_accuracy": 0.8604490756988525, + "num_tokens": 69606118.0, + "step": 1821 + }, + { + "epoch": 0.23177712759190944, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.7856422662734985, + "learning_rate": 7.719372615515049e-07, + "loss": 0.509, + "mean_token_accuracy": 0.837689220905304, + "num_tokens": 69650102.0, + "step": 1822 + }, + { + "epoch": 0.23190433787049994, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.7447060346603394, + "learning_rate": 7.723611699872827e-07, + "loss": 0.4539, + "mean_token_accuracy": 0.8528926968574524, + "num_tokens": 69693440.0, + "step": 1823 + }, + { + "epoch": 0.23203154814909044, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.7714022397994995, + "learning_rate": 7.727850784230606e-07, + "loss": 0.4741, + "mean_token_accuracy": 0.8445830941200256, + "num_tokens": 69732651.0, + "step": 1824 + }, + { + "epoch": 0.23215875842768097, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 2.071061372756958, + "learning_rate": 7.732089868588385e-07, + "loss": 0.4907, + "mean_token_accuracy": 0.8455911874771118, + "num_tokens": 69769915.0, + "step": 1825 + }, + { + "epoch": 0.23228596870627147, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 1.7525913715362549, + "learning_rate": 7.736328952946163e-07, + "loss": 0.4127, + "mean_token_accuracy": 0.8681983947753906, + "num_tokens": 69813943.0, + "step": 1826 + }, + { + "epoch": 0.23241317898486197, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.9415775537490845, + "learning_rate": 7.740568037303942e-07, + "loss": 0.4415, + "mean_token_accuracy": 0.856366753578186, + "num_tokens": 69847039.0, + "step": 1827 + }, + { + "epoch": 0.2325403892634525, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 2.042311191558838, + "learning_rate": 7.744807121661721e-07, + "loss": 0.4064, + "mean_token_accuracy": 0.8715440034866333, + "num_tokens": 69884588.0, + "step": 1828 + }, + { + "epoch": 0.232667599542043, + "ewc_loss": 3.591179847717285e-06, + "grad_norm": 1.9657573699951172, + "learning_rate": 7.749046206019499e-07, + "loss": 0.4701, + "mean_token_accuracy": 0.8499504923820496, + "num_tokens": 69925552.0, + "step": 1829 + }, + { + "epoch": 0.2327948098206335, + "ewc_loss": 3.591179847717285e-06, + "grad_norm": 2.571589469909668, + "learning_rate": 7.753285290377279e-07, + "loss": 0.406, + "mean_token_accuracy": 0.867947518825531, + "num_tokens": 69961776.0, + "step": 1830 + }, + { + "epoch": 0.23292202009922403, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.8899648189544678, + "learning_rate": 7.757524374735057e-07, + "loss": 0.512, + "mean_token_accuracy": 0.8352724313735962, + "num_tokens": 69999775.0, + "step": 1831 + }, + { + "epoch": 0.23304923037781453, + "ewc_loss": 3.591179847717285e-06, + "grad_norm": 1.74117910861969, + "learning_rate": 7.761763459092836e-07, + "loss": 0.4222, + "mean_token_accuracy": 0.8661637306213379, + "num_tokens": 70037294.0, + "step": 1832 + }, + { + "epoch": 0.23317644065640503, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.8333243131637573, + "learning_rate": 7.766002543450614e-07, + "loss": 0.4565, + "mean_token_accuracy": 0.8537759184837341, + "num_tokens": 70075014.0, + "step": 1833 + }, + { + "epoch": 0.23330365093499555, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.9476914405822754, + "learning_rate": 7.770241627808392e-07, + "loss": 0.4146, + "mean_token_accuracy": 0.8657232522964478, + "num_tokens": 70112186.0, + "step": 1834 + }, + { + "epoch": 0.23343086121358606, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 1.8848422765731812, + "learning_rate": 7.774480712166172e-07, + "loss": 0.4897, + "mean_token_accuracy": 0.8430235385894775, + "num_tokens": 70153285.0, + "step": 1835 + }, + { + "epoch": 0.23355807149217656, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 1.964290976524353, + "learning_rate": 7.77871979652395e-07, + "loss": 0.4741, + "mean_token_accuracy": 0.8482593297958374, + "num_tokens": 70193382.0, + "step": 1836 + }, + { + "epoch": 0.23368528177076708, + "ewc_loss": 3.591179847717285e-06, + "grad_norm": 2.033871650695801, + "learning_rate": 7.782958880881729e-07, + "loss": 0.4625, + "mean_token_accuracy": 0.8498268127441406, + "num_tokens": 70227279.0, + "step": 1837 + }, + { + "epoch": 0.23381249204935758, + "ewc_loss": 3.591179847717285e-06, + "grad_norm": 1.8885735273361206, + "learning_rate": 7.787197965239508e-07, + "loss": 0.4265, + "mean_token_accuracy": 0.8596215844154358, + "num_tokens": 70264395.0, + "step": 1838 + }, + { + "epoch": 0.23393970232794808, + "ewc_loss": 3.591179847717285e-06, + "grad_norm": 2.128425359725952, + "learning_rate": 7.791437049597287e-07, + "loss": 0.5393, + "mean_token_accuracy": 0.8286556005477905, + "num_tokens": 70295016.0, + "step": 1839 + }, + { + "epoch": 0.2340669126065386, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 1.889899730682373, + "learning_rate": 7.795676133955065e-07, + "loss": 0.4827, + "mean_token_accuracy": 0.8463131785392761, + "num_tokens": 70332774.0, + "step": 1840 + }, + { + "epoch": 0.2341941228851291, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 1.9308351278305054, + "learning_rate": 7.799915218312844e-07, + "loss": 0.47, + "mean_token_accuracy": 0.8500071167945862, + "num_tokens": 70366880.0, + "step": 1841 + }, + { + "epoch": 0.23432133316371964, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 1.8671828508377075, + "learning_rate": 7.804154302670622e-07, + "loss": 0.4449, + "mean_token_accuracy": 0.8560004234313965, + "num_tokens": 70405235.0, + "step": 1842 + }, + { + "epoch": 0.23444854344231014, + "ewc_loss": 3.6209821701049805e-06, + "grad_norm": 1.9505040645599365, + "learning_rate": 7.808393387028402e-07, + "loss": 0.4535, + "mean_token_accuracy": 0.8531944751739502, + "num_tokens": 70446503.0, + "step": 1843 + }, + { + "epoch": 0.23457575372090064, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 1.8785463571548462, + "learning_rate": 7.81263247138618e-07, + "loss": 0.5055, + "mean_token_accuracy": 0.838036060333252, + "num_tokens": 70483514.0, + "step": 1844 + }, + { + "epoch": 0.23470296399949117, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 1.9361069202423096, + "learning_rate": 7.816871555743959e-07, + "loss": 0.4665, + "mean_token_accuracy": 0.8505222797393799, + "num_tokens": 70520944.0, + "step": 1845 + }, + { + "epoch": 0.23483017427808167, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 1.9258002042770386, + "learning_rate": 7.821110640101738e-07, + "loss": 0.4897, + "mean_token_accuracy": 0.840024471282959, + "num_tokens": 70564399.0, + "step": 1846 + }, + { + "epoch": 0.23495738455667217, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 1.8802558183670044, + "learning_rate": 7.825349724459517e-07, + "loss": 0.464, + "mean_token_accuracy": 0.852621853351593, + "num_tokens": 70604000.0, + "step": 1847 + }, + { + "epoch": 0.2350845948352627, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 2.010474920272827, + "learning_rate": 7.829588808817294e-07, + "loss": 0.4571, + "mean_token_accuracy": 0.8520035147666931, + "num_tokens": 70645133.0, + "step": 1848 + }, + { + "epoch": 0.2352118051138532, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 2.0419809818267822, + "learning_rate": 7.833827893175074e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8378694653511047, + "num_tokens": 70675824.0, + "step": 1849 + }, + { + "epoch": 0.2353390153924437, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 1.905835509300232, + "learning_rate": 7.838066977532852e-07, + "loss": 0.4538, + "mean_token_accuracy": 0.8533955812454224, + "num_tokens": 70714103.0, + "step": 1850 + }, + { + "epoch": 0.23546622567103423, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 1.9221001863479614, + "learning_rate": 7.842306061890632e-07, + "loss": 0.5007, + "mean_token_accuracy": 0.8403782844543457, + "num_tokens": 70754265.0, + "step": 1851 + }, + { + "epoch": 0.23559343594962473, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 1.9974831342697144, + "learning_rate": 7.84654514624841e-07, + "loss": 0.4716, + "mean_token_accuracy": 0.8455115556716919, + "num_tokens": 70791368.0, + "step": 1852 + }, + { + "epoch": 0.23572064622821523, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 1.9724221229553223, + "learning_rate": 7.850784230606188e-07, + "loss": 0.5136, + "mean_token_accuracy": 0.8396218419075012, + "num_tokens": 70827397.0, + "step": 1853 + }, + { + "epoch": 0.23584785650680576, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 1.6626262664794922, + "learning_rate": 7.855023314963968e-07, + "loss": 0.4853, + "mean_token_accuracy": 0.8465306162834167, + "num_tokens": 70872503.0, + "step": 1854 + }, + { + "epoch": 0.23597506678539626, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 1.966822624206543, + "learning_rate": 7.859262399321746e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.8434672355651855, + "num_tokens": 70914376.0, + "step": 1855 + }, + { + "epoch": 0.23610227706398676, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 1.893795132637024, + "learning_rate": 7.863501483679524e-07, + "loss": 0.444, + "mean_token_accuracy": 0.8568835854530334, + "num_tokens": 70954004.0, + "step": 1856 + }, + { + "epoch": 0.2362294873425773, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 1.8136131763458252, + "learning_rate": 7.867740568037303e-07, + "loss": 0.4422, + "mean_token_accuracy": 0.8542437553405762, + "num_tokens": 70991765.0, + "step": 1857 + }, + { + "epoch": 0.2363566976211678, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 1.9272024631500244, + "learning_rate": 7.871979652395082e-07, + "loss": 0.4717, + "mean_token_accuracy": 0.846047043800354, + "num_tokens": 71025478.0, + "step": 1858 + }, + { + "epoch": 0.2364839078997583, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 1.7223085165023804, + "learning_rate": 7.876218736752861e-07, + "loss": 0.4265, + "mean_token_accuracy": 0.8654080033302307, + "num_tokens": 71067287.0, + "step": 1859 + }, + { + "epoch": 0.23661111817834882, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 1.9104704856872559, + "learning_rate": 7.88045782111064e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.8425782918930054, + "num_tokens": 71108095.0, + "step": 1860 + }, + { + "epoch": 0.23673832845693932, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 1.7276482582092285, + "learning_rate": 7.884696905468418e-07, + "loss": 0.4296, + "mean_token_accuracy": 0.8614383935928345, + "num_tokens": 71150297.0, + "step": 1861 + }, + { + "epoch": 0.23686553873552982, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 1.8333158493041992, + "learning_rate": 7.888935989826198e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.8432343006134033, + "num_tokens": 71189205.0, + "step": 1862 + }, + { + "epoch": 0.23699274901412035, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 1.9886672496795654, + "learning_rate": 7.893175074183976e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8504818677902222, + "num_tokens": 71228141.0, + "step": 1863 + }, + { + "epoch": 0.23711995929271085, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 1.8288496732711792, + "learning_rate": 7.897414158541754e-07, + "loss": 0.4656, + "mean_token_accuracy": 0.8506146669387817, + "num_tokens": 71267854.0, + "step": 1864 + }, + { + "epoch": 0.23724716957130135, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 1.921157717704773, + "learning_rate": 7.901653242899533e-07, + "loss": 0.5012, + "mean_token_accuracy": 0.8450393676757812, + "num_tokens": 71305030.0, + "step": 1865 + }, + { + "epoch": 0.23737437984989188, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 2.0043084621429443, + "learning_rate": 7.905892327257312e-07, + "loss": 0.4588, + "mean_token_accuracy": 0.8514896631240845, + "num_tokens": 71342202.0, + "step": 1866 + }, + { + "epoch": 0.23750159012848238, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 2.040180206298828, + "learning_rate": 7.910131411615091e-07, + "loss": 0.4591, + "mean_token_accuracy": 0.8551597595214844, + "num_tokens": 71379702.0, + "step": 1867 + }, + { + "epoch": 0.2376288004070729, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 1.8600037097930908, + "learning_rate": 7.91437049597287e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8512955904006958, + "num_tokens": 71417785.0, + "step": 1868 + }, + { + "epoch": 0.2377560106856634, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 1.8317195177078247, + "learning_rate": 7.918609580330648e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.839322030544281, + "num_tokens": 71461358.0, + "step": 1869 + }, + { + "epoch": 0.2378832209642539, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 1.8536509275436401, + "learning_rate": 7.922848664688428e-07, + "loss": 0.5013, + "mean_token_accuracy": 0.8419709205627441, + "num_tokens": 71500059.0, + "step": 1870 + }, + { + "epoch": 0.23801043124284443, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 1.8008805513381958, + "learning_rate": 7.927087749046205e-07, + "loss": 0.413, + "mean_token_accuracy": 0.8672471046447754, + "num_tokens": 71537043.0, + "step": 1871 + }, + { + "epoch": 0.23813764152143493, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 1.8279383182525635, + "learning_rate": 7.931326833403983e-07, + "loss": 0.4471, + "mean_token_accuracy": 0.8554545640945435, + "num_tokens": 71570671.0, + "step": 1872 + }, + { + "epoch": 0.23826485180002543, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 1.9375033378601074, + "learning_rate": 7.935565917761763e-07, + "loss": 0.4557, + "mean_token_accuracy": 0.8542265892028809, + "num_tokens": 71614813.0, + "step": 1873 + }, + { + "epoch": 0.23839206207861596, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 1.9781800508499146, + "learning_rate": 7.939805002119541e-07, + "loss": 0.4723, + "mean_token_accuracy": 0.8479686975479126, + "num_tokens": 71650441.0, + "step": 1874 + }, + { + "epoch": 0.23851927235720646, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 2.101473093032837, + "learning_rate": 7.944044086477321e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.847200870513916, + "num_tokens": 71690425.0, + "step": 1875 + }, + { + "epoch": 0.23864648263579696, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.7874609231948853, + "learning_rate": 7.948283170835099e-07, + "loss": 0.4253, + "mean_token_accuracy": 0.8613770008087158, + "num_tokens": 71727349.0, + "step": 1876 + }, + { + "epoch": 0.2387736929143875, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.784091591835022, + "learning_rate": 7.952522255192878e-07, + "loss": 0.3952, + "mean_token_accuracy": 0.8711283206939697, + "num_tokens": 71770386.0, + "step": 1877 + }, + { + "epoch": 0.238900903192978, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 1.7957966327667236, + "learning_rate": 7.956761339550657e-07, + "loss": 0.422, + "mean_token_accuracy": 0.8640198707580566, + "num_tokens": 71808946.0, + "step": 1878 + }, + { + "epoch": 0.2390281134715685, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.937882661819458, + "learning_rate": 7.961000423908435e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.856641411781311, + "num_tokens": 71848605.0, + "step": 1879 + }, + { + "epoch": 0.23915532375015902, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.7965809106826782, + "learning_rate": 7.965239508266214e-07, + "loss": 0.4437, + "mean_token_accuracy": 0.856791079044342, + "num_tokens": 71886771.0, + "step": 1880 + }, + { + "epoch": 0.23928253402874952, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 1.8288202285766602, + "learning_rate": 7.969478592623993e-07, + "loss": 0.4219, + "mean_token_accuracy": 0.8621978759765625, + "num_tokens": 71924905.0, + "step": 1881 + }, + { + "epoch": 0.23940974430734002, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.9119446277618408, + "learning_rate": 7.973717676981771e-07, + "loss": 0.4491, + "mean_token_accuracy": 0.8573741316795349, + "num_tokens": 71964513.0, + "step": 1882 + }, + { + "epoch": 0.23953695458593055, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.789353847503662, + "learning_rate": 7.977956761339551e-07, + "loss": 0.4219, + "mean_token_accuracy": 0.8636224269866943, + "num_tokens": 72000918.0, + "step": 1883 + }, + { + "epoch": 0.23966416486452105, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.8275551795959473, + "learning_rate": 7.982195845697329e-07, + "loss": 0.5489, + "mean_token_accuracy": 0.8249202966690063, + "num_tokens": 72040840.0, + "step": 1884 + }, + { + "epoch": 0.23979137514311155, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.9165507555007935, + "learning_rate": 7.986434930055108e-07, + "loss": 0.4233, + "mean_token_accuracy": 0.8618969321250916, + "num_tokens": 72076757.0, + "step": 1885 + }, + { + "epoch": 0.23991858542170208, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.8034346103668213, + "learning_rate": 7.990674014412886e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.8489305973052979, + "num_tokens": 72120760.0, + "step": 1886 + }, + { + "epoch": 0.24004579570029258, + "ewc_loss": 3.6954879760742188e-06, + "grad_norm": 1.9042590856552124, + "learning_rate": 7.994913098770665e-07, + "loss": 0.4653, + "mean_token_accuracy": 0.8499011993408203, + "num_tokens": 72159985.0, + "step": 1887 + }, + { + "epoch": 0.24017300597888308, + "ewc_loss": 3.6954879760742188e-06, + "grad_norm": 1.977861762046814, + "learning_rate": 7.999152183128444e-07, + "loss": 0.4983, + "mean_token_accuracy": 0.8388932943344116, + "num_tokens": 72196377.0, + "step": 1888 + }, + { + "epoch": 0.2403002162574736, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 2.129432201385498, + "learning_rate": 8.003391267486223e-07, + "loss": 0.4449, + "mean_token_accuracy": 0.8569918870925903, + "num_tokens": 72236585.0, + "step": 1889 + }, + { + "epoch": 0.2404274265360641, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 2.140164613723755, + "learning_rate": 8.007630351844001e-07, + "loss": 0.4572, + "mean_token_accuracy": 0.8521590232849121, + "num_tokens": 72268450.0, + "step": 1890 + }, + { + "epoch": 0.24055463681465464, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 2.013474941253662, + "learning_rate": 8.011869436201781e-07, + "loss": 0.4436, + "mean_token_accuracy": 0.8567001223564148, + "num_tokens": 72304092.0, + "step": 1891 + }, + { + "epoch": 0.24068184709324514, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.832201361656189, + "learning_rate": 8.016108520559559e-07, + "loss": 0.4691, + "mean_token_accuracy": 0.8485215306282043, + "num_tokens": 72347312.0, + "step": 1892 + }, + { + "epoch": 0.24080905737183564, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 2.073028326034546, + "learning_rate": 8.020347604917338e-07, + "loss": 0.4339, + "mean_token_accuracy": 0.8597773313522339, + "num_tokens": 72382135.0, + "step": 1893 + }, + { + "epoch": 0.24093626765042617, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 2.4046735763549805, + "learning_rate": 8.024586689275116e-07, + "loss": 0.458, + "mean_token_accuracy": 0.8520802855491638, + "num_tokens": 72421526.0, + "step": 1894 + }, + { + "epoch": 0.24106347792901667, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 2.0619704723358154, + "learning_rate": 8.028825773632894e-07, + "loss": 0.4625, + "mean_token_accuracy": 0.8546571135520935, + "num_tokens": 72458592.0, + "step": 1895 + }, + { + "epoch": 0.24119068820760717, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 2.0178024768829346, + "learning_rate": 8.033064857990674e-07, + "loss": 0.3934, + "mean_token_accuracy": 0.872546374797821, + "num_tokens": 72494734.0, + "step": 1896 + }, + { + "epoch": 0.2413178984861977, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 2.0653226375579834, + "learning_rate": 8.037303942348452e-07, + "loss": 0.4427, + "mean_token_accuracy": 0.85874342918396, + "num_tokens": 72531560.0, + "step": 1897 + }, + { + "epoch": 0.2414451087647882, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.913697361946106, + "learning_rate": 8.041543026706231e-07, + "loss": 0.4394, + "mean_token_accuracy": 0.8594706058502197, + "num_tokens": 72570144.0, + "step": 1898 + }, + { + "epoch": 0.2415723190433787, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.8796955347061157, + "learning_rate": 8.04578211106401e-07, + "loss": 0.4478, + "mean_token_accuracy": 0.8540477156639099, + "num_tokens": 72606738.0, + "step": 1899 + }, + { + "epoch": 0.24169952932196923, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.8938654661178589, + "learning_rate": 8.050021195421789e-07, + "loss": 0.4921, + "mean_token_accuracy": 0.842726469039917, + "num_tokens": 72646230.0, + "step": 1900 + }, + { + "epoch": 0.24182673960055973, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.997828722000122, + "learning_rate": 8.054260279779567e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.8494062423706055, + "num_tokens": 72681811.0, + "step": 1901 + }, + { + "epoch": 0.24195394987915023, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.8664557933807373, + "learning_rate": 8.058499364137346e-07, + "loss": 0.4967, + "mean_token_accuracy": 0.8416304588317871, + "num_tokens": 72721805.0, + "step": 1902 + }, + { + "epoch": 0.24208116015774075, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.7632097005844116, + "learning_rate": 8.062738448495124e-07, + "loss": 0.4132, + "mean_token_accuracy": 0.8660942316055298, + "num_tokens": 72763142.0, + "step": 1903 + }, + { + "epoch": 0.24220837043633126, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.930395245552063, + "learning_rate": 8.066977532852904e-07, + "loss": 0.4034, + "mean_token_accuracy": 0.8740180134773254, + "num_tokens": 72802047.0, + "step": 1904 + }, + { + "epoch": 0.24233558071492176, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 2.121213912963867, + "learning_rate": 8.071216617210682e-07, + "loss": 0.4791, + "mean_token_accuracy": 0.846656322479248, + "num_tokens": 72834470.0, + "step": 1905 + }, + { + "epoch": 0.24246279099351228, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.7624783515930176, + "learning_rate": 8.075455701568461e-07, + "loss": 0.4501, + "mean_token_accuracy": 0.8555359840393066, + "num_tokens": 72873101.0, + "step": 1906 + }, + { + "epoch": 0.24259000127210278, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.7508621215820312, + "learning_rate": 8.07969478592624e-07, + "loss": 0.4324, + "mean_token_accuracy": 0.8631672859191895, + "num_tokens": 72915504.0, + "step": 1907 + }, + { + "epoch": 0.24271721155069328, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.7586767673492432, + "learning_rate": 8.083933870284019e-07, + "loss": 0.4055, + "mean_token_accuracy": 0.8704829812049866, + "num_tokens": 72947179.0, + "step": 1908 + }, + { + "epoch": 0.2428444218292838, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.8194308280944824, + "learning_rate": 8.088172954641796e-07, + "loss": 0.4444, + "mean_token_accuracy": 0.8567978143692017, + "num_tokens": 72983313.0, + "step": 1909 + }, + { + "epoch": 0.2429716321078743, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.9403531551361084, + "learning_rate": 8.092412038999576e-07, + "loss": 0.4922, + "mean_token_accuracy": 0.8428784608840942, + "num_tokens": 73014218.0, + "step": 1910 + }, + { + "epoch": 0.24309884238646481, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 2.11555814743042, + "learning_rate": 8.096651123357354e-07, + "loss": 0.5025, + "mean_token_accuracy": 0.8361762166023254, + "num_tokens": 73048282.0, + "step": 1911 + }, + { + "epoch": 0.24322605266505534, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.9648869037628174, + "learning_rate": 8.100890207715134e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8430757522583008, + "num_tokens": 73084390.0, + "step": 1912 + }, + { + "epoch": 0.24335326294364584, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.9308006763458252, + "learning_rate": 8.105129292072912e-07, + "loss": 0.4582, + "mean_token_accuracy": 0.8533855676651001, + "num_tokens": 73123296.0, + "step": 1913 + }, + { + "epoch": 0.24348047322223634, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 2.5552926063537598, + "learning_rate": 8.10936837643069e-07, + "loss": 0.4998, + "mean_token_accuracy": 0.8410013914108276, + "num_tokens": 73157533.0, + "step": 1914 + }, + { + "epoch": 0.24360768350082687, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.9641777276992798, + "learning_rate": 8.11360746078847e-07, + "loss": 0.4378, + "mean_token_accuracy": 0.8580296039581299, + "num_tokens": 73195314.0, + "step": 1915 + }, + { + "epoch": 0.24373489377941737, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.8189318180084229, + "learning_rate": 8.117846545146248e-07, + "loss": 0.4063, + "mean_token_accuracy": 0.8664968609809875, + "num_tokens": 73236763.0, + "step": 1916 + }, + { + "epoch": 0.2438621040580079, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 2.110027313232422, + "learning_rate": 8.122085629504026e-07, + "loss": 0.4914, + "mean_token_accuracy": 0.8401967287063599, + "num_tokens": 73276719.0, + "step": 1917 + }, + { + "epoch": 0.2439893143365984, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.8994005918502808, + "learning_rate": 8.126324713861805e-07, + "loss": 0.459, + "mean_token_accuracy": 0.8541333675384521, + "num_tokens": 73318594.0, + "step": 1918 + }, + { + "epoch": 0.2441165246151889, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.829580545425415, + "learning_rate": 8.130563798219584e-07, + "loss": 0.4363, + "mean_token_accuracy": 0.8614983558654785, + "num_tokens": 73358760.0, + "step": 1919 + }, + { + "epoch": 0.24424373489377943, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.9538902044296265, + "learning_rate": 8.134802882577363e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.8510396480560303, + "num_tokens": 73398623.0, + "step": 1920 + }, + { + "epoch": 0.24437094517236993, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.6995917558670044, + "learning_rate": 8.139041966935142e-07, + "loss": 0.4607, + "mean_token_accuracy": 0.853313684463501, + "num_tokens": 73446369.0, + "step": 1921 + }, + { + "epoch": 0.24449815545096043, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 2.018094301223755, + "learning_rate": 8.14328105129292e-07, + "loss": 0.5206, + "mean_token_accuracy": 0.8391163349151611, + "num_tokens": 73483152.0, + "step": 1922 + }, + { + "epoch": 0.24462536572955096, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 2.067847490310669, + "learning_rate": 8.1475201356507e-07, + "loss": 0.473, + "mean_token_accuracy": 0.8485972881317139, + "num_tokens": 73519179.0, + "step": 1923 + }, + { + "epoch": 0.24475257600814146, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.8192030191421509, + "learning_rate": 8.151759220008477e-07, + "loss": 0.4462, + "mean_token_accuracy": 0.8550868034362793, + "num_tokens": 73561947.0, + "step": 1924 + }, + { + "epoch": 0.24487978628673196, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 2.0289156436920166, + "learning_rate": 8.155998304366256e-07, + "loss": 0.4608, + "mean_token_accuracy": 0.8563916683197021, + "num_tokens": 73595841.0, + "step": 1925 + }, + { + "epoch": 0.2450069965653225, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 2.139038562774658, + "learning_rate": 8.160237388724035e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.8429504632949829, + "num_tokens": 73629716.0, + "step": 1926 + }, + { + "epoch": 0.245134206843913, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.9850523471832275, + "learning_rate": 8.164476473081814e-07, + "loss": 0.4736, + "mean_token_accuracy": 0.8499323129653931, + "num_tokens": 73667285.0, + "step": 1927 + }, + { + "epoch": 0.2452614171225035, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.842943549156189, + "learning_rate": 8.168715557439593e-07, + "loss": 0.4194, + "mean_token_accuracy": 0.8652803301811218, + "num_tokens": 73708059.0, + "step": 1928 + }, + { + "epoch": 0.24538862740109402, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.8318438529968262, + "learning_rate": 8.172954641797372e-07, + "loss": 0.4319, + "mean_token_accuracy": 0.8612459897994995, + "num_tokens": 73748553.0, + "step": 1929 + }, + { + "epoch": 0.24551583767968452, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.8069337606430054, + "learning_rate": 8.17719372615515e-07, + "loss": 0.5309, + "mean_token_accuracy": 0.8313761353492737, + "num_tokens": 73796295.0, + "step": 1930 + }, + { + "epoch": 0.24564304795827502, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.9809467792510986, + "learning_rate": 8.18143281051293e-07, + "loss": 0.5089, + "mean_token_accuracy": 0.8395555019378662, + "num_tokens": 73835960.0, + "step": 1931 + }, + { + "epoch": 0.24577025823686555, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.9045096635818481, + "learning_rate": 8.185671894870707e-07, + "loss": 0.5245, + "mean_token_accuracy": 0.8342218399047852, + "num_tokens": 73875302.0, + "step": 1932 + }, + { + "epoch": 0.24589746851545605, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.8793418407440186, + "learning_rate": 8.189910979228485e-07, + "loss": 0.4176, + "mean_token_accuracy": 0.8634805679321289, + "num_tokens": 73907703.0, + "step": 1933 + }, + { + "epoch": 0.24602467879404655, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 2.0689234733581543, + "learning_rate": 8.194150063586265e-07, + "loss": 0.5124, + "mean_token_accuracy": 0.8395215272903442, + "num_tokens": 73945449.0, + "step": 1934 + }, + { + "epoch": 0.24615188907263708, + "ewc_loss": 3.725290298461914e-06, + "grad_norm": 2.06299090385437, + "learning_rate": 8.198389147944043e-07, + "loss": 0.4982, + "mean_token_accuracy": 0.838287353515625, + "num_tokens": 73982461.0, + "step": 1935 + }, + { + "epoch": 0.24627909935122758, + "ewc_loss": 3.725290298461914e-06, + "grad_norm": 1.8822177648544312, + "learning_rate": 8.202628232301823e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8515669107437134, + "num_tokens": 74018252.0, + "step": 1936 + }, + { + "epoch": 0.24640630962981808, + "ewc_loss": 3.725290298461914e-06, + "grad_norm": 2.0402050018310547, + "learning_rate": 8.206867316659601e-07, + "loss": 0.4944, + "mean_token_accuracy": 0.8407717943191528, + "num_tokens": 74065843.0, + "step": 1937 + }, + { + "epoch": 0.2465335199084086, + "ewc_loss": 3.725290298461914e-06, + "grad_norm": 2.0601794719696045, + "learning_rate": 8.21110640101738e-07, + "loss": 0.3856, + "mean_token_accuracy": 0.8724730014801025, + "num_tokens": 74106389.0, + "step": 1938 + }, + { + "epoch": 0.2466607301869991, + "ewc_loss": 3.725290298461914e-06, + "grad_norm": 1.8037488460540771, + "learning_rate": 8.215345485375159e-07, + "loss": 0.5049, + "mean_token_accuracy": 0.8393286466598511, + "num_tokens": 74150786.0, + "step": 1939 + }, + { + "epoch": 0.2467879404655896, + "ewc_loss": 3.725290298461914e-06, + "grad_norm": 1.7999056577682495, + "learning_rate": 8.219584569732937e-07, + "loss": 0.4744, + "mean_token_accuracy": 0.8451117873191833, + "num_tokens": 74190474.0, + "step": 1940 + }, + { + "epoch": 0.24691515074418013, + "ewc_loss": 3.725290298461914e-06, + "grad_norm": 1.9245867729187012, + "learning_rate": 8.223823654090715e-07, + "loss": 0.4258, + "mean_token_accuracy": 0.8635725975036621, + "num_tokens": 74226437.0, + "step": 1941 + }, + { + "epoch": 0.24704236102277063, + "ewc_loss": 3.725290298461914e-06, + "grad_norm": 2.0155270099639893, + "learning_rate": 8.228062738448495e-07, + "loss": 0.4522, + "mean_token_accuracy": 0.8526831269264221, + "num_tokens": 74267292.0, + "step": 1942 + }, + { + "epoch": 0.24716957130136116, + "ewc_loss": 3.725290298461914e-06, + "grad_norm": 1.8832045793533325, + "learning_rate": 8.232301822806273e-07, + "loss": 0.4791, + "mean_token_accuracy": 0.8478943109512329, + "num_tokens": 74306738.0, + "step": 1943 + }, + { + "epoch": 0.24729678157995166, + "ewc_loss": 3.725290298461914e-06, + "grad_norm": 1.8410382270812988, + "learning_rate": 8.236540907164053e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.8522859215736389, + "num_tokens": 74343151.0, + "step": 1944 + }, + { + "epoch": 0.24742399185854216, + "ewc_loss": 3.7401914596557617e-06, + "grad_norm": 1.8278343677520752, + "learning_rate": 8.240779991521831e-07, + "loss": 0.4439, + "mean_token_accuracy": 0.8562718033790588, + "num_tokens": 74382755.0, + "step": 1945 + }, + { + "epoch": 0.2475512021371327, + "ewc_loss": 3.7401914596557617e-06, + "grad_norm": 1.8548448085784912, + "learning_rate": 8.24501907587961e-07, + "loss": 0.48, + "mean_token_accuracy": 0.8464304208755493, + "num_tokens": 74426129.0, + "step": 1946 + }, + { + "epoch": 0.2476784124157232, + "ewc_loss": 3.7401914596557617e-06, + "grad_norm": 1.9280025959014893, + "learning_rate": 8.249258160237388e-07, + "loss": 0.4897, + "mean_token_accuracy": 0.8496435880661011, + "num_tokens": 74460904.0, + "step": 1947 + }, + { + "epoch": 0.2478056226943137, + "ewc_loss": 3.7401914596557617e-06, + "grad_norm": 2.1573965549468994, + "learning_rate": 8.253497244595167e-07, + "loss": 0.4796, + "mean_token_accuracy": 0.8470295667648315, + "num_tokens": 74497993.0, + "step": 1948 + }, + { + "epoch": 0.24793283297290422, + "ewc_loss": 3.7550926208496094e-06, + "grad_norm": 1.9443259239196777, + "learning_rate": 8.257736328952945e-07, + "loss": 0.5007, + "mean_token_accuracy": 0.8406962156295776, + "num_tokens": 74541108.0, + "step": 1949 + }, + { + "epoch": 0.24806004325149472, + "ewc_loss": 3.7550926208496094e-06, + "grad_norm": 1.8098430633544922, + "learning_rate": 8.261975413310725e-07, + "loss": 0.4688, + "mean_token_accuracy": 0.850529670715332, + "num_tokens": 74579389.0, + "step": 1950 + }, + { + "epoch": 0.24818725353008522, + "ewc_loss": 3.769993782043457e-06, + "grad_norm": 2.1694135665893555, + "learning_rate": 8.266214497668503e-07, + "loss": 0.4238, + "mean_token_accuracy": 0.8657742142677307, + "num_tokens": 74621546.0, + "step": 1951 + }, + { + "epoch": 0.24831446380867575, + "ewc_loss": 3.769993782043457e-06, + "grad_norm": 1.9642237424850464, + "learning_rate": 8.270453582026283e-07, + "loss": 0.5022, + "mean_token_accuracy": 0.8369594812393188, + "num_tokens": 74660178.0, + "step": 1952 + }, + { + "epoch": 0.24844167408726625, + "ewc_loss": 3.769993782043457e-06, + "grad_norm": 2.0123910903930664, + "learning_rate": 8.274692666384061e-07, + "loss": 0.4486, + "mean_token_accuracy": 0.8567490577697754, + "num_tokens": 74693867.0, + "step": 1953 + }, + { + "epoch": 0.24856888436585675, + "ewc_loss": 3.7550926208496094e-06, + "grad_norm": 1.8553805351257324, + "learning_rate": 8.27893175074184e-07, + "loss": 0.4626, + "mean_token_accuracy": 0.850165605545044, + "num_tokens": 74727561.0, + "step": 1954 + }, + { + "epoch": 0.24869609464444728, + "ewc_loss": 3.7550926208496094e-06, + "grad_norm": 1.903023362159729, + "learning_rate": 8.283170835099618e-07, + "loss": 0.4654, + "mean_token_accuracy": 0.8514954447746277, + "num_tokens": 74763719.0, + "step": 1955 + }, + { + "epoch": 0.24882330492303778, + "ewc_loss": 3.769993782043457e-06, + "grad_norm": 1.823047161102295, + "learning_rate": 8.287409919457396e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.859840989112854, + "num_tokens": 74802666.0, + "step": 1956 + }, + { + "epoch": 0.24895051520162828, + "ewc_loss": 3.7550926208496094e-06, + "grad_norm": 1.9896223545074463, + "learning_rate": 8.291649003815175e-07, + "loss": 0.4806, + "mean_token_accuracy": 0.8463761806488037, + "num_tokens": 74840307.0, + "step": 1957 + }, + { + "epoch": 0.2490777254802188, + "ewc_loss": 3.769993782043457e-06, + "grad_norm": 1.8547452688217163, + "learning_rate": 8.295888088172954e-07, + "loss": 0.4914, + "mean_token_accuracy": 0.8455744981765747, + "num_tokens": 74880181.0, + "step": 1958 + }, + { + "epoch": 0.2492049357588093, + "ewc_loss": 3.7550926208496094e-06, + "grad_norm": 1.818742275238037, + "learning_rate": 8.300127172530733e-07, + "loss": 0.3992, + "mean_token_accuracy": 0.8693777322769165, + "num_tokens": 74917148.0, + "step": 1959 + }, + { + "epoch": 0.2493321460373998, + "ewc_loss": 3.7550926208496094e-06, + "grad_norm": 1.980096459388733, + "learning_rate": 8.304366256888512e-07, + "loss": 0.4376, + "mean_token_accuracy": 0.857720136642456, + "num_tokens": 74953124.0, + "step": 1960 + }, + { + "epoch": 0.24945935631599034, + "ewc_loss": 3.7848949432373047e-06, + "grad_norm": 2.6942484378814697, + "learning_rate": 8.308605341246291e-07, + "loss": 0.449, + "mean_token_accuracy": 0.8549654483795166, + "num_tokens": 74989726.0, + "step": 1961 + }, + { + "epoch": 0.24958656659458084, + "ewc_loss": 3.7848949432373047e-06, + "grad_norm": 1.9770997762680054, + "learning_rate": 8.312844425604068e-07, + "loss": 0.4521, + "mean_token_accuracy": 0.8598756790161133, + "num_tokens": 75024849.0, + "step": 1962 + }, + { + "epoch": 0.24971377687317134, + "ewc_loss": 3.7848949432373047e-06, + "grad_norm": 1.7937977313995361, + "learning_rate": 8.317083509961848e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.848583996295929, + "num_tokens": 75066464.0, + "step": 1963 + }, + { + "epoch": 0.24984098715176187, + "ewc_loss": 3.7997961044311523e-06, + "grad_norm": 1.9063581228256226, + "learning_rate": 8.321322594319626e-07, + "loss": 0.43, + "mean_token_accuracy": 0.8612157702445984, + "num_tokens": 75106829.0, + "step": 1964 + }, + { + "epoch": 0.24996819743035237, + "ewc_loss": 3.7848949432373047e-06, + "grad_norm": 2.0491602420806885, + "learning_rate": 8.325561678677405e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.846904993057251, + "num_tokens": 75148227.0, + "step": 1965 + }, + { + "epoch": 0.2500954077089429, + "ewc_loss": 3.7997961044311523e-06, + "grad_norm": 1.9058253765106201, + "learning_rate": 8.329800763035184e-07, + "loss": 0.497, + "mean_token_accuracy": 0.8396580219268799, + "num_tokens": 75188719.0, + "step": 1966 + }, + { + "epoch": 0.25022261798753337, + "ewc_loss": 3.7997961044311523e-06, + "grad_norm": 2.205204725265503, + "learning_rate": 8.334039847392963e-07, + "loss": 0.4476, + "mean_token_accuracy": 0.8544690012931824, + "num_tokens": 75222260.0, + "step": 1967 + }, + { + "epoch": 0.2503498282661239, + "ewc_loss": 3.7997961044311523e-06, + "grad_norm": 1.9734070301055908, + "learning_rate": 8.338278931750742e-07, + "loss": 0.424, + "mean_token_accuracy": 0.8634529113769531, + "num_tokens": 75255489.0, + "step": 1968 + }, + { + "epoch": 0.2504770385447144, + "ewc_loss": 3.7997961044311523e-06, + "grad_norm": 1.8542251586914062, + "learning_rate": 8.342518016108521e-07, + "loss": 0.4489, + "mean_token_accuracy": 0.8531966209411621, + "num_tokens": 75290548.0, + "step": 1969 + }, + { + "epoch": 0.2506042488233049, + "ewc_loss": 3.7997961044311523e-06, + "grad_norm": 1.9903186559677124, + "learning_rate": 8.346757100466298e-07, + "loss": 0.4507, + "mean_token_accuracy": 0.8563341498374939, + "num_tokens": 75323912.0, + "step": 1970 + }, + { + "epoch": 0.2507314591018954, + "ewc_loss": 3.7997961044311523e-06, + "grad_norm": 1.8870807886123657, + "learning_rate": 8.350996184824078e-07, + "loss": 0.4901, + "mean_token_accuracy": 0.8398468494415283, + "num_tokens": 75361206.0, + "step": 1971 + }, + { + "epoch": 0.25085866938048595, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 20.46225357055664, + "learning_rate": 8.355235269181856e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.8485356569290161, + "num_tokens": 75397292.0, + "step": 1972 + }, + { + "epoch": 0.2509858796590764, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.1180200576782227, + "learning_rate": 8.359474353539635e-07, + "loss": 0.4339, + "mean_token_accuracy": 0.8613336086273193, + "num_tokens": 75437616.0, + "step": 1973 + }, + { + "epoch": 0.25111308993766696, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.739922046661377, + "learning_rate": 8.363713437897414e-07, + "loss": 0.5206, + "mean_token_accuracy": 0.8364216089248657, + "num_tokens": 75486172.0, + "step": 1974 + }, + { + "epoch": 0.2512403002162575, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.8482229709625244, + "learning_rate": 8.367952522255193e-07, + "loss": 0.4565, + "mean_token_accuracy": 0.8506009578704834, + "num_tokens": 75526637.0, + "step": 1975 + }, + { + "epoch": 0.25136751049484796, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.8148677349090576, + "learning_rate": 8.372191606612972e-07, + "loss": 0.4591, + "mean_token_accuracy": 0.8545727729797363, + "num_tokens": 75564632.0, + "step": 1976 + }, + { + "epoch": 0.2514947207734385, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.0119524002075195, + "learning_rate": 8.376430690970749e-07, + "loss": 0.4256, + "mean_token_accuracy": 0.8646377325057983, + "num_tokens": 75595553.0, + "step": 1977 + }, + { + "epoch": 0.251621931052029, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.7851861715316772, + "learning_rate": 8.380669775328528e-07, + "loss": 0.4209, + "mean_token_accuracy": 0.8652093410491943, + "num_tokens": 75635847.0, + "step": 1978 + }, + { + "epoch": 0.25174914133061954, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.9560750722885132, + "learning_rate": 8.384908859686307e-07, + "loss": 0.4194, + "mean_token_accuracy": 0.8660950660705566, + "num_tokens": 75677403.0, + "step": 1979 + }, + { + "epoch": 0.25187635160921, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.8278467655181885, + "learning_rate": 8.389147944044086e-07, + "loss": 0.4597, + "mean_token_accuracy": 0.8519986271858215, + "num_tokens": 75720703.0, + "step": 1980 + }, + { + "epoch": 0.25200356188780054, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.939160704612732, + "learning_rate": 8.393387028401864e-07, + "loss": 0.4851, + "mean_token_accuracy": 0.8455595970153809, + "num_tokens": 75758000.0, + "step": 1981 + }, + { + "epoch": 0.25213077216639107, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.145537853240967, + "learning_rate": 8.397626112759644e-07, + "loss": 0.5099, + "mean_token_accuracy": 0.8394282460212708, + "num_tokens": 75791113.0, + "step": 1982 + }, + { + "epoch": 0.25225798244498154, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.0248591899871826, + "learning_rate": 8.401865197117422e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.8549681901931763, + "num_tokens": 75821733.0, + "step": 1983 + }, + { + "epoch": 0.25238519272357207, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.269317150115967, + "learning_rate": 8.406104281475202e-07, + "loss": 0.4346, + "mean_token_accuracy": 0.8570551872253418, + "num_tokens": 75861404.0, + "step": 1984 + }, + { + "epoch": 0.2525124030021626, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.7669312953948975, + "learning_rate": 8.410343365832979e-07, + "loss": 0.4371, + "mean_token_accuracy": 0.8576180934906006, + "num_tokens": 75903059.0, + "step": 1985 + }, + { + "epoch": 0.2526396132807531, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.7619881629943848, + "learning_rate": 8.414582450190758e-07, + "loss": 0.4476, + "mean_token_accuracy": 0.8573405742645264, + "num_tokens": 75947125.0, + "step": 1986 + }, + { + "epoch": 0.2527668235593436, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.7926523685455322, + "learning_rate": 8.418821534548537e-07, + "loss": 0.5318, + "mean_token_accuracy": 0.8313262462615967, + "num_tokens": 75987519.0, + "step": 1987 + }, + { + "epoch": 0.25289403383793413, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.963792324066162, + "learning_rate": 8.423060618906316e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.8541476726531982, + "num_tokens": 76027728.0, + "step": 1988 + }, + { + "epoch": 0.2530212441165246, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.8353317975997925, + "learning_rate": 8.427299703264095e-07, + "loss": 0.4574, + "mean_token_accuracy": 0.8550684452056885, + "num_tokens": 76070942.0, + "step": 1989 + }, + { + "epoch": 0.25314845439511513, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.8382676839828491, + "learning_rate": 8.431538787621874e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.8471984267234802, + "num_tokens": 76113330.0, + "step": 1990 + }, + { + "epoch": 0.25327566467370566, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.877740502357483, + "learning_rate": 8.435777871979652e-07, + "loss": 0.4363, + "mean_token_accuracy": 0.860910952091217, + "num_tokens": 76153511.0, + "step": 1991 + }, + { + "epoch": 0.25340287495229613, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.918121099472046, + "learning_rate": 8.440016956337432e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.8541899919509888, + "num_tokens": 76191505.0, + "step": 1992 + }, + { + "epoch": 0.25353008523088666, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.8086425065994263, + "learning_rate": 8.444256040695209e-07, + "loss": 0.4769, + "mean_token_accuracy": 0.849301278591156, + "num_tokens": 76232479.0, + "step": 1993 + }, + { + "epoch": 0.2536572955094772, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.9668083190917969, + "learning_rate": 8.448495125052988e-07, + "loss": 0.471, + "mean_token_accuracy": 0.8507453799247742, + "num_tokens": 76270192.0, + "step": 1994 + }, + { + "epoch": 0.25378450578806766, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.986057162284851, + "learning_rate": 8.452734209410767e-07, + "loss": 0.4108, + "mean_token_accuracy": 0.868371844291687, + "num_tokens": 76303981.0, + "step": 1995 + }, + { + "epoch": 0.2539117160666582, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.863694667816162, + "learning_rate": 8.456973293768545e-07, + "loss": 0.4356, + "mean_token_accuracy": 0.8583044409751892, + "num_tokens": 76338496.0, + "step": 1996 + }, + { + "epoch": 0.2540389263452487, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.8379977941513062, + "learning_rate": 8.461212378126325e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.844569981098175, + "num_tokens": 76377904.0, + "step": 1997 + }, + { + "epoch": 0.2541661366238392, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.8383351564407349, + "learning_rate": 8.465451462484103e-07, + "loss": 0.5121, + "mean_token_accuracy": 0.8391846418380737, + "num_tokens": 76413905.0, + "step": 1998 + }, + { + "epoch": 0.2542933469024297, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.9179357290267944, + "learning_rate": 8.469690546841882e-07, + "loss": 0.4612, + "mean_token_accuracy": 0.8523896932601929, + "num_tokens": 76451591.0, + "step": 1999 + }, + { + "epoch": 0.25442055718102025, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.9031453132629395, + "learning_rate": 8.47392963119966e-07, + "loss": 0.442, + "mean_token_accuracy": 0.8577589988708496, + "num_tokens": 76486191.0, + "step": 2000 + }, + { + "epoch": 0.2545477674596107, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.8978475332260132, + "learning_rate": 8.478168715557439e-07, + "loss": 0.4607, + "mean_token_accuracy": 0.8551600575447083, + "num_tokens": 76523317.0, + "step": 2001 + }, + { + "epoch": 0.25467497773820125, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.7832155227661133, + "learning_rate": 8.482407799915217e-07, + "loss": 0.4677, + "mean_token_accuracy": 0.8495858907699585, + "num_tokens": 76562661.0, + "step": 2002 + }, + { + "epoch": 0.2548021880167918, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 7.768195152282715, + "learning_rate": 8.486646884272997e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.8505319952964783, + "num_tokens": 76600914.0, + "step": 2003 + }, + { + "epoch": 0.25492939829538225, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 2.1218221187591553, + "learning_rate": 8.490885968630775e-07, + "loss": 0.4839, + "mean_token_accuracy": 0.8433725833892822, + "num_tokens": 76636310.0, + "step": 2004 + }, + { + "epoch": 0.2550566085739728, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.7498923540115356, + "learning_rate": 8.495125052988555e-07, + "loss": 0.4684, + "mean_token_accuracy": 0.8539668321609497, + "num_tokens": 76680889.0, + "step": 2005 + }, + { + "epoch": 0.2551838188525633, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.792325496673584, + "learning_rate": 8.499364137346333e-07, + "loss": 0.4596, + "mean_token_accuracy": 0.8566360473632812, + "num_tokens": 76722469.0, + "step": 2006 + }, + { + "epoch": 0.2553110291311538, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.8835150003433228, + "learning_rate": 8.503603221704112e-07, + "loss": 0.4942, + "mean_token_accuracy": 0.8417675495147705, + "num_tokens": 76757668.0, + "step": 2007 + }, + { + "epoch": 0.2554382394097443, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.944231629371643, + "learning_rate": 8.50784230606189e-07, + "loss": 0.4568, + "mean_token_accuracy": 0.856122612953186, + "num_tokens": 76795607.0, + "step": 2008 + }, + { + "epoch": 0.25556544968833483, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.6911216974258423, + "learning_rate": 8.512081390419669e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8543416261672974, + "num_tokens": 76837161.0, + "step": 2009 + }, + { + "epoch": 0.2556926599669253, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 2.032208204269409, + "learning_rate": 8.516320474777447e-07, + "loss": 0.4643, + "mean_token_accuracy": 0.8533016443252563, + "num_tokens": 76872198.0, + "step": 2010 + }, + { + "epoch": 0.25581987024551583, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.807210922241211, + "learning_rate": 8.520559559135227e-07, + "loss": 0.4299, + "mean_token_accuracy": 0.8647783994674683, + "num_tokens": 76908976.0, + "step": 2011 + }, + { + "epoch": 0.25594708052410636, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.9099597930908203, + "learning_rate": 8.524798643493005e-07, + "loss": 0.4794, + "mean_token_accuracy": 0.8503441214561462, + "num_tokens": 76947298.0, + "step": 2012 + }, + { + "epoch": 0.25607429080269684, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.9078550338745117, + "learning_rate": 8.529037727850785e-07, + "loss": 0.4367, + "mean_token_accuracy": 0.8591760396957397, + "num_tokens": 76985112.0, + "step": 2013 + }, + { + "epoch": 0.25620150108128736, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.870701789855957, + "learning_rate": 8.533276812208563e-07, + "loss": 0.4093, + "mean_token_accuracy": 0.8679779171943665, + "num_tokens": 77020309.0, + "step": 2014 + }, + { + "epoch": 0.2563287113598779, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 2.69815731048584, + "learning_rate": 8.53751589656634e-07, + "loss": 0.4479, + "mean_token_accuracy": 0.8546455502510071, + "num_tokens": 77063245.0, + "step": 2015 + }, + { + "epoch": 0.25645592163846836, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 2.0593183040618896, + "learning_rate": 8.54175498092412e-07, + "loss": 0.4458, + "mean_token_accuracy": 0.8572622537612915, + "num_tokens": 77096791.0, + "step": 2016 + }, + { + "epoch": 0.2565831319170589, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 8.15287971496582, + "learning_rate": 8.545994065281898e-07, + "loss": 0.4257, + "mean_token_accuracy": 0.8638283610343933, + "num_tokens": 77129281.0, + "step": 2017 + }, + { + "epoch": 0.2567103421956494, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.339871644973755, + "learning_rate": 8.550233149639677e-07, + "loss": 0.5131, + "mean_token_accuracy": 0.8364018201828003, + "num_tokens": 77168785.0, + "step": 2018 + }, + { + "epoch": 0.2568375524742399, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.0620739459991455, + "learning_rate": 8.554472233997456e-07, + "loss": 0.4811, + "mean_token_accuracy": 0.8437190055847168, + "num_tokens": 77208360.0, + "step": 2019 + }, + { + "epoch": 0.2569647627528304, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.1971118450164795, + "learning_rate": 8.558711318355235e-07, + "loss": 0.5143, + "mean_token_accuracy": 0.8343685865402222, + "num_tokens": 77245539.0, + "step": 2020 + }, + { + "epoch": 0.25709197303142095, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 1.95154869556427, + "learning_rate": 8.562950402713014e-07, + "loss": 0.4163, + "mean_token_accuracy": 0.871511697769165, + "num_tokens": 77279118.0, + "step": 2021 + }, + { + "epoch": 0.2572191833100114, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.019223213195801, + "learning_rate": 8.567189487070793e-07, + "loss": 0.5024, + "mean_token_accuracy": 0.8426236510276794, + "num_tokens": 77316259.0, + "step": 2022 + }, + { + "epoch": 0.25734639358860195, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 1.841242790222168, + "learning_rate": 8.57142857142857e-07, + "loss": 0.5183, + "mean_token_accuracy": 0.8364469408988953, + "num_tokens": 77359712.0, + "step": 2023 + }, + { + "epoch": 0.2574736038671925, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 2.290194272994995, + "learning_rate": 8.57566765578635e-07, + "loss": 0.424, + "mean_token_accuracy": 0.8671281933784485, + "num_tokens": 77402269.0, + "step": 2024 + }, + { + "epoch": 0.25760081414578295, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.7292250394821167, + "learning_rate": 8.579906740144128e-07, + "loss": 0.4533, + "mean_token_accuracy": 0.8563094735145569, + "num_tokens": 77445791.0, + "step": 2025 + }, + { + "epoch": 0.2577280244243735, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.8808646202087402, + "learning_rate": 8.584145824501907e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.8410947322845459, + "num_tokens": 77485376.0, + "step": 2026 + }, + { + "epoch": 0.257855234702964, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.8118311166763306, + "learning_rate": 8.588384908859686e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8487485647201538, + "num_tokens": 77524344.0, + "step": 2027 + }, + { + "epoch": 0.25798244498155454, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.8573380708694458, + "learning_rate": 8.592623993217465e-07, + "loss": 0.4758, + "mean_token_accuracy": 0.8515651822090149, + "num_tokens": 77562746.0, + "step": 2028 + }, + { + "epoch": 0.258109655260145, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 2.032376766204834, + "learning_rate": 8.596863077575244e-07, + "loss": 0.4972, + "mean_token_accuracy": 0.8455735445022583, + "num_tokens": 77601261.0, + "step": 2029 + }, + { + "epoch": 0.25823686553873554, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 2.2893593311309814, + "learning_rate": 8.601102161933023e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8484802842140198, + "num_tokens": 77640054.0, + "step": 2030 + }, + { + "epoch": 0.25836407581732607, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 1.9117059707641602, + "learning_rate": 8.6053412462908e-07, + "loss": 0.4527, + "mean_token_accuracy": 0.8557327389717102, + "num_tokens": 77680098.0, + "step": 2031 + }, + { + "epoch": 0.25849128609591654, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 2.15836238861084, + "learning_rate": 8.60958033064858e-07, + "loss": 0.4478, + "mean_token_accuracy": 0.858853816986084, + "num_tokens": 77712517.0, + "step": 2032 + }, + { + "epoch": 0.25861849637450707, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 2.053290605545044, + "learning_rate": 8.613819415006358e-07, + "loss": 0.5102, + "mean_token_accuracy": 0.845826268196106, + "num_tokens": 77751335.0, + "step": 2033 + }, + { + "epoch": 0.2587457066530976, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 1.7793073654174805, + "learning_rate": 8.618058499364137e-07, + "loss": 0.4978, + "mean_token_accuracy": 0.8421481847763062, + "num_tokens": 77792300.0, + "step": 2034 + }, + { + "epoch": 0.25887291693168807, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 1.8511375188827515, + "learning_rate": 8.622297583721916e-07, + "loss": 0.4567, + "mean_token_accuracy": 0.8575373291969299, + "num_tokens": 77836391.0, + "step": 2035 + }, + { + "epoch": 0.2590001272102786, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 1.8169922828674316, + "learning_rate": 8.626536668079695e-07, + "loss": 0.4671, + "mean_token_accuracy": 0.8499937653541565, + "num_tokens": 77871600.0, + "step": 2036 + }, + { + "epoch": 0.2591273374888691, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.002253770828247, + "learning_rate": 8.630775752437474e-07, + "loss": 0.4527, + "mean_token_accuracy": 0.8545371890068054, + "num_tokens": 77904623.0, + "step": 2037 + }, + { + "epoch": 0.2592545477674596, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.1803481578826904, + "learning_rate": 8.635014836795251e-07, + "loss": 0.4925, + "mean_token_accuracy": 0.8445937633514404, + "num_tokens": 77941145.0, + "step": 2038 + }, + { + "epoch": 0.2593817580460501, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 1.9596235752105713, + "learning_rate": 8.63925392115303e-07, + "loss": 0.4296, + "mean_token_accuracy": 0.8606685400009155, + "num_tokens": 77975087.0, + "step": 2039 + }, + { + "epoch": 0.25950896832464065, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.0399279594421387, + "learning_rate": 8.643493005510809e-07, + "loss": 0.4373, + "mean_token_accuracy": 0.8590381145477295, + "num_tokens": 78009526.0, + "step": 2040 + }, + { + "epoch": 0.2596361786032311, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 1.8966765403747559, + "learning_rate": 8.647732089868588e-07, + "loss": 0.5145, + "mean_token_accuracy": 0.8358791470527649, + "num_tokens": 78050918.0, + "step": 2041 + }, + { + "epoch": 0.25976338888182166, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 1.9831358194351196, + "learning_rate": 8.651971174226366e-07, + "loss": 0.4335, + "mean_token_accuracy": 0.8615109324455261, + "num_tokens": 78085547.0, + "step": 2042 + }, + { + "epoch": 0.2598905991604122, + "ewc_loss": 3.9637088775634766e-06, + "grad_norm": 2.01167893409729, + "learning_rate": 8.656210258584146e-07, + "loss": 0.501, + "mean_token_accuracy": 0.8393335342407227, + "num_tokens": 78122916.0, + "step": 2043 + }, + { + "epoch": 0.26001780943900266, + "ewc_loss": 3.9637088775634766e-06, + "grad_norm": 1.88616144657135, + "learning_rate": 8.660449342941924e-07, + "loss": 0.4925, + "mean_token_accuracy": 0.8412166833877563, + "num_tokens": 78161800.0, + "step": 2044 + }, + { + "epoch": 0.2601450197175932, + "ewc_loss": 3.9637088775634766e-06, + "grad_norm": 1.849889874458313, + "learning_rate": 8.664688427299704e-07, + "loss": 0.4723, + "mean_token_accuracy": 0.8502309918403625, + "num_tokens": 78199961.0, + "step": 2045 + }, + { + "epoch": 0.2602722299961837, + "ewc_loss": 3.9637088775634766e-06, + "grad_norm": 1.9862221479415894, + "learning_rate": 8.668927511657481e-07, + "loss": 0.431, + "mean_token_accuracy": 0.8600364923477173, + "num_tokens": 78231332.0, + "step": 2046 + }, + { + "epoch": 0.2603994402747742, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 1.9913030862808228, + "learning_rate": 8.67316659601526e-07, + "loss": 0.487, + "mean_token_accuracy": 0.8436866998672485, + "num_tokens": 78265054.0, + "step": 2047 + }, + { + "epoch": 0.2605266505533647, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 1.8533285856246948, + "learning_rate": 8.677405680373039e-07, + "loss": 0.4435, + "mean_token_accuracy": 0.8609939813613892, + "num_tokens": 78305445.0, + "step": 2048 + }, + { + "epoch": 0.26065386083195524, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 1.9966992139816284, + "learning_rate": 8.681644764730818e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.8491854667663574, + "num_tokens": 78339174.0, + "step": 2049 + }, + { + "epoch": 0.2607810711105457, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 1.8323915004730225, + "learning_rate": 8.685883849088596e-07, + "loss": 0.4152, + "mean_token_accuracy": 0.8676420450210571, + "num_tokens": 78377630.0, + "step": 2050 + }, + { + "epoch": 0.26090828138913624, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 1.8956629037857056, + "learning_rate": 8.690122933446376e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8512976765632629, + "num_tokens": 78419908.0, + "step": 2051 + }, + { + "epoch": 0.26103549166772677, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 1.7463350296020508, + "learning_rate": 8.694362017804154e-07, + "loss": 0.4406, + "mean_token_accuracy": 0.8606270551681519, + "num_tokens": 78459561.0, + "step": 2052 + }, + { + "epoch": 0.26116270194631724, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 1.86009681224823, + "learning_rate": 8.698601102161933e-07, + "loss": 0.4697, + "mean_token_accuracy": 0.8495168685913086, + "num_tokens": 78502500.0, + "step": 2053 + }, + { + "epoch": 0.26128991222490777, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.898707389831543, + "learning_rate": 8.702840186519711e-07, + "loss": 0.4285, + "mean_token_accuracy": 0.8627304434776306, + "num_tokens": 78545411.0, + "step": 2054 + }, + { + "epoch": 0.2614171225034983, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 1.8879187107086182, + "learning_rate": 8.70707927087749e-07, + "loss": 0.4288, + "mean_token_accuracy": 0.8638619184494019, + "num_tokens": 78585878.0, + "step": 2055 + }, + { + "epoch": 0.2615443327820888, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.041877031326294, + "learning_rate": 8.711318355235269e-07, + "loss": 0.5162, + "mean_token_accuracy": 0.8334451913833618, + "num_tokens": 78621233.0, + "step": 2056 + }, + { + "epoch": 0.2616715430606793, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.1125848293304443, + "learning_rate": 8.715557439593047e-07, + "loss": 0.4374, + "mean_token_accuracy": 0.8586910963058472, + "num_tokens": 78654644.0, + "step": 2057 + }, + { + "epoch": 0.26179875333926983, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.9890968799591064, + "learning_rate": 8.719796523950826e-07, + "loss": 0.5134, + "mean_token_accuracy": 0.8362382054328918, + "num_tokens": 78692851.0, + "step": 2058 + }, + { + "epoch": 0.2619259636178603, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.8084678649902344, + "learning_rate": 8.724035608308605e-07, + "loss": 0.4075, + "mean_token_accuracy": 0.8687819242477417, + "num_tokens": 78725329.0, + "step": 2059 + }, + { + "epoch": 0.26205317389645083, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.18455171585083, + "learning_rate": 8.728274692666384e-07, + "loss": 0.4635, + "mean_token_accuracy": 0.8524295091629028, + "num_tokens": 78766645.0, + "step": 2060 + }, + { + "epoch": 0.26218038417504136, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.955035924911499, + "learning_rate": 8.732513777024162e-07, + "loss": 0.4583, + "mean_token_accuracy": 0.8542727828025818, + "num_tokens": 78801576.0, + "step": 2061 + }, + { + "epoch": 0.26230759445363183, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.8342535495758057, + "learning_rate": 8.736752861381941e-07, + "loss": 0.5022, + "mean_token_accuracy": 0.8406252861022949, + "num_tokens": 78845577.0, + "step": 2062 + }, + { + "epoch": 0.26243480473222236, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.8609541654586792, + "learning_rate": 8.740991945739719e-07, + "loss": 0.4655, + "mean_token_accuracy": 0.8525346517562866, + "num_tokens": 78884229.0, + "step": 2063 + }, + { + "epoch": 0.2625620150108129, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.8167067766189575, + "learning_rate": 8.745231030097499e-07, + "loss": 0.454, + "mean_token_accuracy": 0.8548956513404846, + "num_tokens": 78926948.0, + "step": 2064 + }, + { + "epoch": 0.26268922528940336, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.860745906829834, + "learning_rate": 8.749470114455277e-07, + "loss": 0.4296, + "mean_token_accuracy": 0.8621779680252075, + "num_tokens": 78963582.0, + "step": 2065 + }, + { + "epoch": 0.2628164355679939, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.050305128097534, + "learning_rate": 8.753709198813056e-07, + "loss": 0.5363, + "mean_token_accuracy": 0.8305181860923767, + "num_tokens": 79007345.0, + "step": 2066 + }, + { + "epoch": 0.2629436458465844, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.788103699684143, + "learning_rate": 8.757948283170835e-07, + "loss": 0.4349, + "mean_token_accuracy": 0.862846314907074, + "num_tokens": 79049505.0, + "step": 2067 + }, + { + "epoch": 0.2630708561251749, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.7956995964050293, + "learning_rate": 8.762187367528613e-07, + "loss": 0.4299, + "mean_token_accuracy": 0.8605251312255859, + "num_tokens": 79087419.0, + "step": 2068 + }, + { + "epoch": 0.2631980664037654, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.006925582885742, + "learning_rate": 8.766426451886392e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.8465701341629028, + "num_tokens": 79131444.0, + "step": 2069 + }, + { + "epoch": 0.26332527668235595, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.8586819171905518, + "learning_rate": 8.770665536244171e-07, + "loss": 0.5046, + "mean_token_accuracy": 0.843573808670044, + "num_tokens": 79171976.0, + "step": 2070 + }, + { + "epoch": 0.2634524869609464, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.8943687677383423, + "learning_rate": 8.774904620601949e-07, + "loss": 0.434, + "mean_token_accuracy": 0.8586626052856445, + "num_tokens": 79206273.0, + "step": 2071 + }, + { + "epoch": 0.26357969723953695, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.9224610328674316, + "learning_rate": 8.779143704959729e-07, + "loss": 0.4327, + "mean_token_accuracy": 0.8611322045326233, + "num_tokens": 79245564.0, + "step": 2072 + }, + { + "epoch": 0.2637069075181275, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.933428168296814, + "learning_rate": 8.783382789317507e-07, + "loss": 0.467, + "mean_token_accuracy": 0.8523789644241333, + "num_tokens": 79281857.0, + "step": 2073 + }, + { + "epoch": 0.26383411779671795, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.9995758533477783, + "learning_rate": 8.787621873675286e-07, + "loss": 0.4987, + "mean_token_accuracy": 0.8372459411621094, + "num_tokens": 79312482.0, + "step": 2074 + }, + { + "epoch": 0.2639613280753085, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.8696873188018799, + "learning_rate": 8.791860958033065e-07, + "loss": 0.4387, + "mean_token_accuracy": 0.8612040281295776, + "num_tokens": 79348157.0, + "step": 2075 + }, + { + "epoch": 0.264088538353899, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.9639326333999634, + "learning_rate": 8.796100042390842e-07, + "loss": 0.4252, + "mean_token_accuracy": 0.8642102479934692, + "num_tokens": 79387540.0, + "step": 2076 + }, + { + "epoch": 0.2642157486324895, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.0257534980773926, + "learning_rate": 8.800339126748622e-07, + "loss": 0.4408, + "mean_token_accuracy": 0.8592838644981384, + "num_tokens": 79421424.0, + "step": 2077 + }, + { + "epoch": 0.26434295891108, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.9096190929412842, + "learning_rate": 8.8045782111064e-07, + "loss": 0.5101, + "mean_token_accuracy": 0.8378798961639404, + "num_tokens": 79458733.0, + "step": 2078 + }, + { + "epoch": 0.26447016918967053, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.9551085233688354, + "learning_rate": 8.808817295464179e-07, + "loss": 0.4628, + "mean_token_accuracy": 0.8536569476127625, + "num_tokens": 79495394.0, + "step": 2079 + }, + { + "epoch": 0.26459737946826106, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.9784280061721802, + "learning_rate": 8.813056379821958e-07, + "loss": 0.435, + "mean_token_accuracy": 0.861983060836792, + "num_tokens": 79529799.0, + "step": 2080 + }, + { + "epoch": 0.26472458974685154, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.8963096141815186, + "learning_rate": 8.817295464179737e-07, + "loss": 0.4607, + "mean_token_accuracy": 0.8526642322540283, + "num_tokens": 79564664.0, + "step": 2081 + }, + { + "epoch": 0.26485180002544206, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.0491816997528076, + "learning_rate": 8.821534548537515e-07, + "loss": 0.537, + "mean_token_accuracy": 0.8277983665466309, + "num_tokens": 79606104.0, + "step": 2082 + }, + { + "epoch": 0.2649790103040326, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.9663043022155762, + "learning_rate": 8.825773632895295e-07, + "loss": 0.4352, + "mean_token_accuracy": 0.8612164258956909, + "num_tokens": 79647688.0, + "step": 2083 + }, + { + "epoch": 0.26510622058262306, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 2.0214297771453857, + "learning_rate": 8.830012717253072e-07, + "loss": 0.4529, + "mean_token_accuracy": 0.8573879599571228, + "num_tokens": 79685500.0, + "step": 2084 + }, + { + "epoch": 0.2652334308612136, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.928167700767517, + "learning_rate": 8.834251801610852e-07, + "loss": 0.4803, + "mean_token_accuracy": 0.8466504812240601, + "num_tokens": 79729912.0, + "step": 2085 + }, + { + "epoch": 0.2653606411398041, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.011166572570801, + "learning_rate": 8.83849088596863e-07, + "loss": 0.4347, + "mean_token_accuracy": 0.8618062734603882, + "num_tokens": 79767487.0, + "step": 2086 + }, + { + "epoch": 0.2654878514183946, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.8157563209533691, + "learning_rate": 8.842729970326409e-07, + "loss": 0.4198, + "mean_token_accuracy": 0.8681166768074036, + "num_tokens": 79804802.0, + "step": 2087 + }, + { + "epoch": 0.2656150616969851, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.8678863048553467, + "learning_rate": 8.846969054684188e-07, + "loss": 0.5344, + "mean_token_accuracy": 0.8326097726821899, + "num_tokens": 79845685.0, + "step": 2088 + }, + { + "epoch": 0.26574227197557565, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.0065019130706787, + "learning_rate": 8.851208139041967e-07, + "loss": 0.4235, + "mean_token_accuracy": 0.8639804124832153, + "num_tokens": 79884692.0, + "step": 2089 + }, + { + "epoch": 0.2658694822541661, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.0798301696777344, + "learning_rate": 8.855447223399745e-07, + "loss": 0.4816, + "mean_token_accuracy": 0.8432647585868835, + "num_tokens": 79919780.0, + "step": 2090 + }, + { + "epoch": 0.26599669253275665, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.9222530126571655, + "learning_rate": 8.859686307757524e-07, + "loss": 0.5077, + "mean_token_accuracy": 0.8391655683517456, + "num_tokens": 79958317.0, + "step": 2091 + }, + { + "epoch": 0.2661239028113472, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.0118024349212646, + "learning_rate": 8.863925392115302e-07, + "loss": 0.4426, + "mean_token_accuracy": 0.8568423986434937, + "num_tokens": 79994126.0, + "step": 2092 + }, + { + "epoch": 0.26625111308993765, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.80849027633667, + "learning_rate": 8.868164476473082e-07, + "loss": 0.4109, + "mean_token_accuracy": 0.8678229451179504, + "num_tokens": 80033157.0, + "step": 2093 + }, + { + "epoch": 0.2663783233685282, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 1.8568110466003418, + "learning_rate": 8.87240356083086e-07, + "loss": 0.4406, + "mean_token_accuracy": 0.859695315361023, + "num_tokens": 80073354.0, + "step": 2094 + }, + { + "epoch": 0.2665055336471187, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 1.8500096797943115, + "learning_rate": 8.876642645188639e-07, + "loss": 0.5005, + "mean_token_accuracy": 0.8395557403564453, + "num_tokens": 80111819.0, + "step": 2095 + }, + { + "epoch": 0.2666327439257092, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 1.9027551412582397, + "learning_rate": 8.880881729546418e-07, + "loss": 0.4143, + "mean_token_accuracy": 0.8671389818191528, + "num_tokens": 80147951.0, + "step": 2096 + }, + { + "epoch": 0.2667599542042997, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.7310428619384766, + "learning_rate": 8.885120813904197e-07, + "loss": 0.43, + "mean_token_accuracy": 0.8621235489845276, + "num_tokens": 80191319.0, + "step": 2097 + }, + { + "epoch": 0.26688716448289024, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.1383278369903564, + "learning_rate": 8.889359898261976e-07, + "loss": 0.516, + "mean_token_accuracy": 0.8395441770553589, + "num_tokens": 80226300.0, + "step": 2098 + }, + { + "epoch": 0.2670143747614807, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 1.84525465965271, + "learning_rate": 8.893598982619753e-07, + "loss": 0.4421, + "mean_token_accuracy": 0.8583124876022339, + "num_tokens": 80265465.0, + "step": 2099 + }, + { + "epoch": 0.26714158504007124, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 1.7137354612350464, + "learning_rate": 8.897838066977532e-07, + "loss": 0.4467, + "mean_token_accuracy": 0.8581833839416504, + "num_tokens": 80312696.0, + "step": 2100 + }, + { + "epoch": 0.26726879531866177, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.8964698314666748, + "learning_rate": 8.902077151335311e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8512663245201111, + "num_tokens": 80353507.0, + "step": 2101 + }, + { + "epoch": 0.26739600559725224, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.9528679847717285, + "learning_rate": 8.90631623569309e-07, + "loss": 0.5475, + "mean_token_accuracy": 0.8235965967178345, + "num_tokens": 80390795.0, + "step": 2102 + }, + { + "epoch": 0.26752321587584277, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 1.8180960416793823, + "learning_rate": 8.910555320050868e-07, + "loss": 0.4578, + "mean_token_accuracy": 0.8533657789230347, + "num_tokens": 80427599.0, + "step": 2103 + }, + { + "epoch": 0.2676504261544333, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.9555753469467163, + "learning_rate": 8.914794404408648e-07, + "loss": 0.4604, + "mean_token_accuracy": 0.8543318510055542, + "num_tokens": 80468084.0, + "step": 2104 + }, + { + "epoch": 0.26777763643302377, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.0019166469573975, + "learning_rate": 8.919033488766426e-07, + "loss": 0.404, + "mean_token_accuracy": 0.8697470426559448, + "num_tokens": 80507218.0, + "step": 2105 + }, + { + "epoch": 0.2679048467116143, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.0462470054626465, + "learning_rate": 8.923272573124204e-07, + "loss": 0.489, + "mean_token_accuracy": 0.8456920385360718, + "num_tokens": 80541730.0, + "step": 2106 + }, + { + "epoch": 0.2680320569902048, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.832947015762329, + "learning_rate": 8.927511657481983e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.8400439023971558, + "num_tokens": 80584677.0, + "step": 2107 + }, + { + "epoch": 0.2681592672687953, + "ewc_loss": 4.172325134277344e-06, + "grad_norm": 1.8500263690948486, + "learning_rate": 8.931750741839762e-07, + "loss": 0.4333, + "mean_token_accuracy": 0.8630548715591431, + "num_tokens": 80616332.0, + "step": 2108 + }, + { + "epoch": 0.2682864775473858, + "ewc_loss": 4.172325134277344e-06, + "grad_norm": 1.8792074918746948, + "learning_rate": 8.935989826197541e-07, + "loss": 0.4443, + "mean_token_accuracy": 0.8600258827209473, + "num_tokens": 80654435.0, + "step": 2109 + }, + { + "epoch": 0.26841368782597635, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.022535562515259, + "learning_rate": 8.94022891055532e-07, + "loss": 0.4198, + "mean_token_accuracy": 0.8686869740486145, + "num_tokens": 80691246.0, + "step": 2110 + }, + { + "epoch": 0.2685408981045668, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.8919342756271362, + "learning_rate": 8.944467994913098e-07, + "loss": 0.4711, + "mean_token_accuracy": 0.8461667895317078, + "num_tokens": 80730510.0, + "step": 2111 + }, + { + "epoch": 0.26866810838315736, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.8292607069015503, + "learning_rate": 8.948707079270878e-07, + "loss": 0.3899, + "mean_token_accuracy": 0.8744409084320068, + "num_tokens": 80769369.0, + "step": 2112 + }, + { + "epoch": 0.2687953186617479, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.93995201587677, + "learning_rate": 8.952946163628656e-07, + "loss": 0.4441, + "mean_token_accuracy": 0.8577234745025635, + "num_tokens": 80802168.0, + "step": 2113 + }, + { + "epoch": 0.26892252894033836, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.7813278436660767, + "learning_rate": 8.957185247986434e-07, + "loss": 0.3959, + "mean_token_accuracy": 0.8732919692993164, + "num_tokens": 80839389.0, + "step": 2114 + }, + { + "epoch": 0.2690497392189289, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.0160655975341797, + "learning_rate": 8.961424332344213e-07, + "loss": 0.4513, + "mean_token_accuracy": 0.8571375608444214, + "num_tokens": 80881139.0, + "step": 2115 + }, + { + "epoch": 0.2691769494975194, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.019818067550659, + "learning_rate": 8.965663416701992e-07, + "loss": 0.4514, + "mean_token_accuracy": 0.8542640805244446, + "num_tokens": 80919809.0, + "step": 2116 + }, + { + "epoch": 0.2693041597761099, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.0152587890625, + "learning_rate": 8.969902501059771e-07, + "loss": 0.4279, + "mean_token_accuracy": 0.8653221130371094, + "num_tokens": 80956860.0, + "step": 2117 + }, + { + "epoch": 0.2694313700547004, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.8843916654586792, + "learning_rate": 8.97414158541755e-07, + "loss": 0.3959, + "mean_token_accuracy": 0.8741077184677124, + "num_tokens": 80994322.0, + "step": 2118 + }, + { + "epoch": 0.26955858033329094, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.208138942718506, + "learning_rate": 8.978380669775328e-07, + "loss": 0.4778, + "mean_token_accuracy": 0.8482242822647095, + "num_tokens": 81032792.0, + "step": 2119 + }, + { + "epoch": 0.2696857906118814, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.0232057571411133, + "learning_rate": 8.982619754133107e-07, + "loss": 0.4776, + "mean_token_accuracy": 0.8497242331504822, + "num_tokens": 81070011.0, + "step": 2120 + }, + { + "epoch": 0.26981300089047194, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.7974226474761963, + "learning_rate": 8.986858838490886e-07, + "loss": 0.48, + "mean_token_accuracy": 0.8468062281608582, + "num_tokens": 81113145.0, + "step": 2121 + }, + { + "epoch": 0.26994021116906247, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.766291856765747, + "learning_rate": 8.991097922848663e-07, + "loss": 0.447, + "mean_token_accuracy": 0.8579858541488647, + "num_tokens": 81154175.0, + "step": 2122 + }, + { + "epoch": 0.27006742144765294, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.8738553524017334, + "learning_rate": 8.995337007206443e-07, + "loss": 0.4724, + "mean_token_accuracy": 0.8501635789871216, + "num_tokens": 81193319.0, + "step": 2123 + }, + { + "epoch": 0.2701946317262435, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.844123125076294, + "learning_rate": 8.999576091564221e-07, + "loss": 0.44, + "mean_token_accuracy": 0.8604292869567871, + "num_tokens": 81229057.0, + "step": 2124 + }, + { + "epoch": 0.270321842004834, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.0029733180999756, + "learning_rate": 9.003815175922001e-07, + "loss": 0.4481, + "mean_token_accuracy": 0.8564585447311401, + "num_tokens": 81262380.0, + "step": 2125 + }, + { + "epoch": 0.2704490522834245, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.7510930299758911, + "learning_rate": 9.008054260279779e-07, + "loss": 0.4533, + "mean_token_accuracy": 0.855628490447998, + "num_tokens": 81304080.0, + "step": 2126 + }, + { + "epoch": 0.270576262562015, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.9016200304031372, + "learning_rate": 9.012293344637558e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8468883633613586, + "num_tokens": 81340226.0, + "step": 2127 + }, + { + "epoch": 0.27070347284060553, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 1.7759318351745605, + "learning_rate": 9.016532428995337e-07, + "loss": 0.4283, + "mean_token_accuracy": 0.861742377281189, + "num_tokens": 81378046.0, + "step": 2128 + }, + { + "epoch": 0.27083068311919606, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 1.8339104652404785, + "learning_rate": 9.020771513353115e-07, + "loss": 0.4736, + "mean_token_accuracy": 0.8489277362823486, + "num_tokens": 81419486.0, + "step": 2129 + }, + { + "epoch": 0.27095789339778653, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.9802461862564087, + "learning_rate": 9.025010597710894e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8485434651374817, + "num_tokens": 81457488.0, + "step": 2130 + }, + { + "epoch": 0.27108510367637706, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.8025399446487427, + "learning_rate": 9.029249682068673e-07, + "loss": 0.44, + "mean_token_accuracy": 0.8587172031402588, + "num_tokens": 81494812.0, + "step": 2131 + }, + { + "epoch": 0.2712123139549676, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.9193075895309448, + "learning_rate": 9.033488766426451e-07, + "loss": 0.456, + "mean_token_accuracy": 0.8529258966445923, + "num_tokens": 81531421.0, + "step": 2132 + }, + { + "epoch": 0.27133952423355806, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.8151967525482178, + "learning_rate": 9.037727850784231e-07, + "loss": 0.423, + "mean_token_accuracy": 0.8628178834915161, + "num_tokens": 81567221.0, + "step": 2133 + }, + { + "epoch": 0.2714667345121486, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.7583682537078857, + "learning_rate": 9.041966935142009e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.854076623916626, + "num_tokens": 81605832.0, + "step": 2134 + }, + { + "epoch": 0.2715939447907391, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.9047170877456665, + "learning_rate": 9.046206019499788e-07, + "loss": 0.4137, + "mean_token_accuracy": 0.8693791627883911, + "num_tokens": 81646919.0, + "step": 2135 + }, + { + "epoch": 0.2717211550693296, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.8216248750686646, + "learning_rate": 9.050445103857567e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.8471719026565552, + "num_tokens": 81687214.0, + "step": 2136 + }, + { + "epoch": 0.2718483653479201, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.900923252105713, + "learning_rate": 9.054684188215344e-07, + "loss": 0.3969, + "mean_token_accuracy": 0.8740954399108887, + "num_tokens": 81721437.0, + "step": 2137 + }, + { + "epoch": 0.27197557562651065, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.0424108505249023, + "learning_rate": 9.058923272573124e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.8500878810882568, + "num_tokens": 81756595.0, + "step": 2138 + }, + { + "epoch": 0.2721027859051011, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.8287937641143799, + "learning_rate": 9.063162356930902e-07, + "loss": 0.4352, + "mean_token_accuracy": 0.861848771572113, + "num_tokens": 81796198.0, + "step": 2139 + }, + { + "epoch": 0.27222999618369165, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.040548801422119, + "learning_rate": 9.067401441288681e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.8539695739746094, + "num_tokens": 81833002.0, + "step": 2140 + }, + { + "epoch": 0.2723572064622822, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.993224024772644, + "learning_rate": 9.07164052564646e-07, + "loss": 0.4389, + "mean_token_accuracy": 0.8548153638839722, + "num_tokens": 81872089.0, + "step": 2141 + }, + { + "epoch": 0.27248441674087265, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.9402549266815186, + "learning_rate": 9.075879610004239e-07, + "loss": 0.4752, + "mean_token_accuracy": 0.848025918006897, + "num_tokens": 81908356.0, + "step": 2142 + }, + { + "epoch": 0.2726116270194632, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.7314127683639526, + "learning_rate": 9.080118694362017e-07, + "loss": 0.4781, + "mean_token_accuracy": 0.8458738327026367, + "num_tokens": 81950251.0, + "step": 2143 + }, + { + "epoch": 0.2727388372980537, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.8403087854385376, + "learning_rate": 9.084357778719796e-07, + "loss": 0.4696, + "mean_token_accuracy": 0.8483963012695312, + "num_tokens": 81987157.0, + "step": 2144 + }, + { + "epoch": 0.2728660475766442, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.9453657865524292, + "learning_rate": 9.088596863077574e-07, + "loss": 0.4545, + "mean_token_accuracy": 0.8586416244506836, + "num_tokens": 82023633.0, + "step": 2145 + }, + { + "epoch": 0.2729932578552347, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.016645669937134, + "learning_rate": 9.092835947435354e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.8515332937240601, + "num_tokens": 82061311.0, + "step": 2146 + }, + { + "epoch": 0.27312046813382523, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.8825404644012451, + "learning_rate": 9.097075031793132e-07, + "loss": 0.493, + "mean_token_accuracy": 0.843936026096344, + "num_tokens": 82098370.0, + "step": 2147 + }, + { + "epoch": 0.2732476784124157, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.8881666660308838, + "learning_rate": 9.101314116150911e-07, + "loss": 0.4196, + "mean_token_accuracy": 0.8703776597976685, + "num_tokens": 82134931.0, + "step": 2148 + }, + { + "epoch": 0.27337488869100623, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.8779925107955933, + "learning_rate": 9.10555320050869e-07, + "loss": 0.4304, + "mean_token_accuracy": 0.863031268119812, + "num_tokens": 82171391.0, + "step": 2149 + }, + { + "epoch": 0.27350209896959676, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.219425678253174, + "learning_rate": 9.109792284866469e-07, + "loss": 0.4486, + "mean_token_accuracy": 0.8602610230445862, + "num_tokens": 82214064.0, + "step": 2150 + }, + { + "epoch": 0.27362930924818724, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.9071601629257202, + "learning_rate": 9.114031369224247e-07, + "loss": 0.4624, + "mean_token_accuracy": 0.8526268601417542, + "num_tokens": 82252774.0, + "step": 2151 + }, + { + "epoch": 0.27375651952677776, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.9142979383468628, + "learning_rate": 9.118270453582026e-07, + "loss": 0.5189, + "mean_token_accuracy": 0.8392778038978577, + "num_tokens": 82291645.0, + "step": 2152 + }, + { + "epoch": 0.2738837298053683, + "ewc_loss": 4.172325134277344e-06, + "grad_norm": 1.7389016151428223, + "learning_rate": 9.122509537939804e-07, + "loss": 0.452, + "mean_token_accuracy": 0.8512544631958008, + "num_tokens": 82333643.0, + "step": 2153 + }, + { + "epoch": 0.27401094008395877, + "ewc_loss": 4.172325134277344e-06, + "grad_norm": 1.8816802501678467, + "learning_rate": 9.126748622297584e-07, + "loss": 0.49, + "mean_token_accuracy": 0.8422470092773438, + "num_tokens": 82370792.0, + "step": 2154 + }, + { + "epoch": 0.2741381503625493, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.02665114402771, + "learning_rate": 9.130987706655362e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8462226390838623, + "num_tokens": 82408626.0, + "step": 2155 + }, + { + "epoch": 0.2742653606411398, + "ewc_loss": 4.172325134277344e-06, + "grad_norm": 1.9748797416687012, + "learning_rate": 9.135226791013141e-07, + "loss": 0.4987, + "mean_token_accuracy": 0.8464738726615906, + "num_tokens": 82448401.0, + "step": 2156 + }, + { + "epoch": 0.2743925709197303, + "ewc_loss": 4.172325134277344e-06, + "grad_norm": 2.145400285720825, + "learning_rate": 9.13946587537092e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8505059480667114, + "num_tokens": 82476644.0, + "step": 2157 + }, + { + "epoch": 0.2745197811983208, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.9841303825378418, + "learning_rate": 9.143704959728699e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.8475619554519653, + "num_tokens": 82512722.0, + "step": 2158 + }, + { + "epoch": 0.27464699147691135, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.9568800926208496, + "learning_rate": 9.147944044086476e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.845028281211853, + "num_tokens": 82545393.0, + "step": 2159 + }, + { + "epoch": 0.2747742017555018, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.8378536701202393, + "learning_rate": 9.152183128444255e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.8498004674911499, + "num_tokens": 82586329.0, + "step": 2160 + }, + { + "epoch": 0.27490141203409235, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.9365925788879395, + "learning_rate": 9.156422212802034e-07, + "loss": 0.4958, + "mean_token_accuracy": 0.8380417227745056, + "num_tokens": 82625563.0, + "step": 2161 + }, + { + "epoch": 0.2750286223126829, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.0756194591522217, + "learning_rate": 9.160661297159813e-07, + "loss": 0.4326, + "mean_token_accuracy": 0.8616291284561157, + "num_tokens": 82660030.0, + "step": 2162 + }, + { + "epoch": 0.27515583259127335, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 1.991018533706665, + "learning_rate": 9.164900381517592e-07, + "loss": 0.4539, + "mean_token_accuracy": 0.8546863794326782, + "num_tokens": 82698794.0, + "step": 2163 + }, + { + "epoch": 0.2752830428698639, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 1.8169584274291992, + "learning_rate": 9.16913946587537e-07, + "loss": 0.4549, + "mean_token_accuracy": 0.8524254560470581, + "num_tokens": 82740015.0, + "step": 2164 + }, + { + "epoch": 0.2754102531484544, + "ewc_loss": 4.26173210144043e-06, + "grad_norm": 1.963813304901123, + "learning_rate": 9.17337855023315e-07, + "loss": 0.4669, + "mean_token_accuracy": 0.8500979542732239, + "num_tokens": 82775765.0, + "step": 2165 + }, + { + "epoch": 0.2755374634270449, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 1.9116730690002441, + "learning_rate": 9.177617634590928e-07, + "loss": 0.4193, + "mean_token_accuracy": 0.8659987449645996, + "num_tokens": 82814324.0, + "step": 2166 + }, + { + "epoch": 0.2756646737056354, + "ewc_loss": 4.26173210144043e-06, + "grad_norm": 1.8797770738601685, + "learning_rate": 9.181856718948706e-07, + "loss": 0.4061, + "mean_token_accuracy": 0.868037223815918, + "num_tokens": 82851007.0, + "step": 2167 + }, + { + "epoch": 0.27579188398422594, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 2.0652217864990234, + "learning_rate": 9.186095803306485e-07, + "loss": 0.448, + "mean_token_accuracy": 0.855864405632019, + "num_tokens": 82886846.0, + "step": 2168 + }, + { + "epoch": 0.2759190942628164, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 2.059802532196045, + "learning_rate": 9.190334887664264e-07, + "loss": 0.488, + "mean_token_accuracy": 0.8440310955047607, + "num_tokens": 82929607.0, + "step": 2169 + }, + { + "epoch": 0.27604630454140694, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 2.1444127559661865, + "learning_rate": 9.194573972022043e-07, + "loss": 0.544, + "mean_token_accuracy": 0.8347272872924805, + "num_tokens": 82965089.0, + "step": 2170 + }, + { + "epoch": 0.27617351481999747, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 1.8960646390914917, + "learning_rate": 9.198813056379822e-07, + "loss": 0.5184, + "mean_token_accuracy": 0.8350613117218018, + "num_tokens": 83008345.0, + "step": 2171 + }, + { + "epoch": 0.27630072509858794, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 1.8442081212997437, + "learning_rate": 9.2030521407376e-07, + "loss": 0.4746, + "mean_token_accuracy": 0.8459411263465881, + "num_tokens": 83043763.0, + "step": 2172 + }, + { + "epoch": 0.27642793537717847, + "ewc_loss": 4.26173210144043e-06, + "grad_norm": 1.7949411869049072, + "learning_rate": 9.20729122509538e-07, + "loss": 0.4179, + "mean_token_accuracy": 0.8663381934165955, + "num_tokens": 83081389.0, + "step": 2173 + }, + { + "epoch": 0.276555145655769, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 1.9503917694091797, + "learning_rate": 9.211530309453158e-07, + "loss": 0.4574, + "mean_token_accuracy": 0.8525704145431519, + "num_tokens": 83120547.0, + "step": 2174 + }, + { + "epoch": 0.27668235593435947, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 2.0136561393737793, + "learning_rate": 9.215769393810936e-07, + "loss": 0.4612, + "mean_token_accuracy": 0.8540436029434204, + "num_tokens": 83157229.0, + "step": 2175 + }, + { + "epoch": 0.27680956621295, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 1.9128849506378174, + "learning_rate": 9.220008478168715e-07, + "loss": 0.5427, + "mean_token_accuracy": 0.8276156187057495, + "num_tokens": 83198217.0, + "step": 2176 + }, + { + "epoch": 0.2769367764915405, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 2.238696336746216, + "learning_rate": 9.224247562526494e-07, + "loss": 0.4631, + "mean_token_accuracy": 0.8495256304740906, + "num_tokens": 83238750.0, + "step": 2177 + }, + { + "epoch": 0.277063986770131, + "ewc_loss": 4.26173210144043e-06, + "grad_norm": 1.9117425680160522, + "learning_rate": 9.228486646884273e-07, + "loss": 0.4521, + "mean_token_accuracy": 0.8545587062835693, + "num_tokens": 83280138.0, + "step": 2178 + }, + { + "epoch": 0.2771911970487215, + "ewc_loss": 4.26173210144043e-06, + "grad_norm": 1.9823687076568604, + "learning_rate": 9.232725731242052e-07, + "loss": 0.5031, + "mean_token_accuracy": 0.838060736656189, + "num_tokens": 83320633.0, + "step": 2179 + }, + { + "epoch": 0.27731840732731206, + "ewc_loss": 4.26173210144043e-06, + "grad_norm": 1.8896557092666626, + "learning_rate": 9.23696481559983e-07, + "loss": 0.487, + "mean_token_accuracy": 0.8439151644706726, + "num_tokens": 83358787.0, + "step": 2180 + }, + { + "epoch": 0.2774456176059026, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 1.8136286735534668, + "learning_rate": 9.24120389995761e-07, + "loss": 0.431, + "mean_token_accuracy": 0.8627020120620728, + "num_tokens": 83400609.0, + "step": 2181 + }, + { + "epoch": 0.27757282788449306, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 1.9637974500656128, + "learning_rate": 9.245442984315387e-07, + "loss": 0.4478, + "mean_token_accuracy": 0.8565177917480469, + "num_tokens": 83436058.0, + "step": 2182 + }, + { + "epoch": 0.2777000381630836, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 1.8108179569244385, + "learning_rate": 9.249682068673165e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8465147614479065, + "num_tokens": 83474542.0, + "step": 2183 + }, + { + "epoch": 0.2778272484416741, + "ewc_loss": 4.26173210144043e-06, + "grad_norm": 2.0130157470703125, + "learning_rate": 9.253921153030945e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.8360236883163452, + "num_tokens": 83515551.0, + "step": 2184 + }, + { + "epoch": 0.2779544587202646, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 1.8732900619506836, + "learning_rate": 9.258160237388723e-07, + "loss": 0.4338, + "mean_token_accuracy": 0.863406240940094, + "num_tokens": 83551826.0, + "step": 2185 + }, + { + "epoch": 0.2780816689988551, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.939211368560791, + "learning_rate": 9.262399321746503e-07, + "loss": 0.4504, + "mean_token_accuracy": 0.8568453788757324, + "num_tokens": 83586598.0, + "step": 2186 + }, + { + "epoch": 0.27820887927744564, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 1.9071153402328491, + "learning_rate": 9.266638406104281e-07, + "loss": 0.4981, + "mean_token_accuracy": 0.8425478339195251, + "num_tokens": 83625996.0, + "step": 2187 + }, + { + "epoch": 0.2783360895560361, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 2.0274670124053955, + "learning_rate": 9.27087749046206e-07, + "loss": 0.4672, + "mean_token_accuracy": 0.8523130416870117, + "num_tokens": 83657322.0, + "step": 2188 + }, + { + "epoch": 0.27846329983462664, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 2.107576847076416, + "learning_rate": 9.275116574819839e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.852106511592865, + "num_tokens": 83693578.0, + "step": 2189 + }, + { + "epoch": 0.27859051011321717, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 2.1234397888183594, + "learning_rate": 9.279355659177617e-07, + "loss": 0.4689, + "mean_token_accuracy": 0.8530246019363403, + "num_tokens": 83727672.0, + "step": 2190 + }, + { + "epoch": 0.27871772039180764, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 2.181577682495117, + "learning_rate": 9.283594743535395e-07, + "loss": 0.5117, + "mean_token_accuracy": 0.8393652439117432, + "num_tokens": 83761405.0, + "step": 2191 + }, + { + "epoch": 0.2788449306703982, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 2.1342852115631104, + "learning_rate": 9.287833827893175e-07, + "loss": 0.4541, + "mean_token_accuracy": 0.8524897694587708, + "num_tokens": 83801564.0, + "step": 2192 + }, + { + "epoch": 0.2789721409489887, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.977799415588379, + "learning_rate": 9.292072912250953e-07, + "loss": 0.4795, + "mean_token_accuracy": 0.8463852405548096, + "num_tokens": 83834260.0, + "step": 2193 + }, + { + "epoch": 0.2790993512275792, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.774535059928894, + "learning_rate": 9.296311996608733e-07, + "loss": 0.4103, + "mean_token_accuracy": 0.8677723407745361, + "num_tokens": 83873518.0, + "step": 2194 + }, + { + "epoch": 0.2792265615061697, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.8599340915679932, + "learning_rate": 9.300551080966511e-07, + "loss": 0.5238, + "mean_token_accuracy": 0.8351183533668518, + "num_tokens": 83913902.0, + "step": 2195 + }, + { + "epoch": 0.27935377178476023, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 2.0014941692352295, + "learning_rate": 9.30479016532429e-07, + "loss": 0.4496, + "mean_token_accuracy": 0.8523869514465332, + "num_tokens": 83947952.0, + "step": 2196 + }, + { + "epoch": 0.2794809820633507, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.8786041736602783, + "learning_rate": 9.309029249682068e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8548339605331421, + "num_tokens": 83986057.0, + "step": 2197 + }, + { + "epoch": 0.27960819234194123, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.7621779441833496, + "learning_rate": 9.313268334039847e-07, + "loss": 0.4689, + "mean_token_accuracy": 0.8504118919372559, + "num_tokens": 84023881.0, + "step": 2198 + }, + { + "epoch": 0.27973540262053176, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.8822836875915527, + "learning_rate": 9.317507418397625e-07, + "loss": 0.4038, + "mean_token_accuracy": 0.872300922870636, + "num_tokens": 84061480.0, + "step": 2199 + }, + { + "epoch": 0.27986261289912223, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.938804030418396, + "learning_rate": 9.321746502755404e-07, + "loss": 0.5042, + "mean_token_accuracy": 0.8423907160758972, + "num_tokens": 84101782.0, + "step": 2200 + }, + { + "epoch": 0.27998982317771276, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 2.0210280418395996, + "learning_rate": 9.325985587113183e-07, + "loss": 0.4641, + "mean_token_accuracy": 0.8524047136306763, + "num_tokens": 84136125.0, + "step": 2201 + }, + { + "epoch": 0.2801170334563033, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.9924044609069824, + "learning_rate": 9.330224671470962e-07, + "loss": 0.4749, + "mean_token_accuracy": 0.854095458984375, + "num_tokens": 84169748.0, + "step": 2202 + }, + { + "epoch": 0.28024424373489376, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.0194218158721924, + "learning_rate": 9.334463755828741e-07, + "loss": 0.4893, + "mean_token_accuracy": 0.8399308919906616, + "num_tokens": 84201292.0, + "step": 2203 + }, + { + "epoch": 0.2803714540134843, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.9604495763778687, + "learning_rate": 9.338702840186519e-07, + "loss": 0.4893, + "mean_token_accuracy": 0.8419804573059082, + "num_tokens": 84240652.0, + "step": 2204 + }, + { + "epoch": 0.2804986642920748, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.9161874055862427, + "learning_rate": 9.342941924544298e-07, + "loss": 0.4815, + "mean_token_accuracy": 0.847008466720581, + "num_tokens": 84280252.0, + "step": 2205 + }, + { + "epoch": 0.2806258745706653, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.9576064348220825, + "learning_rate": 9.347181008902076e-07, + "loss": 0.4217, + "mean_token_accuracy": 0.8680064082145691, + "num_tokens": 84316505.0, + "step": 2206 + }, + { + "epoch": 0.2807530848492558, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.004014015197754, + "learning_rate": 9.351420093259855e-07, + "loss": 0.4243, + "mean_token_accuracy": 0.8652657866477966, + "num_tokens": 84349990.0, + "step": 2207 + }, + { + "epoch": 0.28088029512784635, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.899582028388977, + "learning_rate": 9.355659177617634e-07, + "loss": 0.4147, + "mean_token_accuracy": 0.8645480871200562, + "num_tokens": 84381066.0, + "step": 2208 + }, + { + "epoch": 0.2810075054064368, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.8943887948989868, + "learning_rate": 9.359898261975413e-07, + "loss": 0.4082, + "mean_token_accuracy": 0.8697072267532349, + "num_tokens": 84422540.0, + "step": 2209 + }, + { + "epoch": 0.28113471568502735, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.049321174621582, + "learning_rate": 9.364137346333192e-07, + "loss": 0.5334, + "mean_token_accuracy": 0.8315653800964355, + "num_tokens": 84462158.0, + "step": 2210 + }, + { + "epoch": 0.2812619259636179, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.9144765138626099, + "learning_rate": 9.368376430690971e-07, + "loss": 0.4227, + "mean_token_accuracy": 0.863610029220581, + "num_tokens": 84495887.0, + "step": 2211 + }, + { + "epoch": 0.28138913624220835, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.034789562225342, + "learning_rate": 9.372615515048749e-07, + "loss": 0.4961, + "mean_token_accuracy": 0.8434702157974243, + "num_tokens": 84530176.0, + "step": 2212 + }, + { + "epoch": 0.2815163465207989, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.8655738830566406, + "learning_rate": 9.376854599406528e-07, + "loss": 0.4974, + "mean_token_accuracy": 0.8438028693199158, + "num_tokens": 84574450.0, + "step": 2213 + }, + { + "epoch": 0.2816435567993894, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.0161194801330566, + "learning_rate": 9.381093683764306e-07, + "loss": 0.4654, + "mean_token_accuracy": 0.8514595031738281, + "num_tokens": 84610661.0, + "step": 2214 + }, + { + "epoch": 0.2817707670779799, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.793760895729065, + "learning_rate": 9.385332768122085e-07, + "loss": 0.4463, + "mean_token_accuracy": 0.8562848567962646, + "num_tokens": 84651259.0, + "step": 2215 + }, + { + "epoch": 0.2818979773565704, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.8850274085998535, + "learning_rate": 9.389571852479864e-07, + "loss": 0.4502, + "mean_token_accuracy": 0.8577954769134521, + "num_tokens": 84692389.0, + "step": 2216 + }, + { + "epoch": 0.28202518763516093, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.0812487602233887, + "learning_rate": 9.393810936837643e-07, + "loss": 0.4545, + "mean_token_accuracy": 0.8554415702819824, + "num_tokens": 84731905.0, + "step": 2217 + }, + { + "epoch": 0.2821523979137514, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.8233444690704346, + "learning_rate": 9.398050021195422e-07, + "loss": 0.4822, + "mean_token_accuracy": 0.8493505120277405, + "num_tokens": 84773727.0, + "step": 2218 + }, + { + "epoch": 0.28227960819234194, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.8430036306381226, + "learning_rate": 9.402289105553201e-07, + "loss": 0.4293, + "mean_token_accuracy": 0.8627630472183228, + "num_tokens": 84816771.0, + "step": 2219 + }, + { + "epoch": 0.28240681847093246, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.0177502632141113, + "learning_rate": 9.406528189910978e-07, + "loss": 0.4536, + "mean_token_accuracy": 0.8560295701026917, + "num_tokens": 84853172.0, + "step": 2220 + }, + { + "epoch": 0.28253402874952294, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.8998396396636963, + "learning_rate": 9.410767274268757e-07, + "loss": 0.4335, + "mean_token_accuracy": 0.8588346242904663, + "num_tokens": 84891508.0, + "step": 2221 + }, + { + "epoch": 0.28266123902811346, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.896841287612915, + "learning_rate": 9.415006358626536e-07, + "loss": 0.531, + "mean_token_accuracy": 0.8303043842315674, + "num_tokens": 84931204.0, + "step": 2222 + }, + { + "epoch": 0.282788449306704, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.0056169033050537, + "learning_rate": 9.419245442984314e-07, + "loss": 0.511, + "mean_token_accuracy": 0.8370985984802246, + "num_tokens": 84966782.0, + "step": 2223 + }, + { + "epoch": 0.28291565958529447, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.3628199100494385, + "learning_rate": 9.423484527342094e-07, + "loss": 0.4575, + "mean_token_accuracy": 0.8515583872795105, + "num_tokens": 85001512.0, + "step": 2224 + }, + { + "epoch": 0.283042869863885, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.060584545135498, + "learning_rate": 9.427723611699872e-07, + "loss": 0.4674, + "mean_token_accuracy": 0.8520386219024658, + "num_tokens": 85036082.0, + "step": 2225 + }, + { + "epoch": 0.2831700801424755, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.8975850343704224, + "learning_rate": 9.431962696057652e-07, + "loss": 0.4732, + "mean_token_accuracy": 0.8507208228111267, + "num_tokens": 85079433.0, + "step": 2226 + }, + { + "epoch": 0.283297290421066, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.0201122760772705, + "learning_rate": 9.43620178041543e-07, + "loss": 0.4264, + "mean_token_accuracy": 0.86552894115448, + "num_tokens": 85116940.0, + "step": 2227 + }, + { + "epoch": 0.2834245006996565, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.870172381401062, + "learning_rate": 9.440440864773208e-07, + "loss": 0.4778, + "mean_token_accuracy": 0.8508231043815613, + "num_tokens": 85153328.0, + "step": 2228 + }, + { + "epoch": 0.28355171097824705, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.9420219659805298, + "learning_rate": 9.444679949130987e-07, + "loss": 0.4803, + "mean_token_accuracy": 0.8449227809906006, + "num_tokens": 85194059.0, + "step": 2229 + }, + { + "epoch": 0.2836789212568376, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.0578954219818115, + "learning_rate": 9.448919033488766e-07, + "loss": 0.4978, + "mean_token_accuracy": 0.8436360359191895, + "num_tokens": 85230820.0, + "step": 2230 + }, + { + "epoch": 0.28380613153542805, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.8434691429138184, + "learning_rate": 9.453158117846544e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8463440537452698, + "num_tokens": 85268864.0, + "step": 2231 + }, + { + "epoch": 0.2839333418140186, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.0006160736083984, + "learning_rate": 9.457397202204324e-07, + "loss": 0.507, + "mean_token_accuracy": 0.8408981561660767, + "num_tokens": 85311377.0, + "step": 2232 + }, + { + "epoch": 0.2840605520926091, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.9394527673721313, + "learning_rate": 9.461636286562102e-07, + "loss": 0.422, + "mean_token_accuracy": 0.8673285841941833, + "num_tokens": 85347082.0, + "step": 2233 + }, + { + "epoch": 0.2841877623711996, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.055149793624878, + "learning_rate": 9.465875370919882e-07, + "loss": 0.5105, + "mean_token_accuracy": 0.8413147926330566, + "num_tokens": 85384696.0, + "step": 2234 + }, + { + "epoch": 0.2843149726497901, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.0852255821228027, + "learning_rate": 9.470114455277659e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.8427884578704834, + "num_tokens": 85420192.0, + "step": 2235 + }, + { + "epoch": 0.28444218292838064, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.8044545650482178, + "learning_rate": 9.474353539635438e-07, + "loss": 0.4462, + "mean_token_accuracy": 0.8551472425460815, + "num_tokens": 85462281.0, + "step": 2236 + }, + { + "epoch": 0.2845693932069711, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 1.8620129823684692, + "learning_rate": 9.478592623993217e-07, + "loss": 0.452, + "mean_token_accuracy": 0.8550897240638733, + "num_tokens": 85495191.0, + "step": 2237 + }, + { + "epoch": 0.28469660348556164, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 1.9259446859359741, + "learning_rate": 9.482831708350996e-07, + "loss": 0.4491, + "mean_token_accuracy": 0.8570008277893066, + "num_tokens": 85533803.0, + "step": 2238 + }, + { + "epoch": 0.28482381376415217, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 1.9543826580047607, + "learning_rate": 9.487070792708775e-07, + "loss": 0.496, + "mean_token_accuracy": 0.8450024724006653, + "num_tokens": 85573424.0, + "step": 2239 + }, + { + "epoch": 0.28495102404274264, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.0344600677490234, + "learning_rate": 9.491309877066554e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.847551167011261, + "num_tokens": 85616423.0, + "step": 2240 + }, + { + "epoch": 0.28507823432133317, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 1.8969794511795044, + "learning_rate": 9.495548961424332e-07, + "loss": 0.5039, + "mean_token_accuracy": 0.8410265445709229, + "num_tokens": 85658410.0, + "step": 2241 + }, + { + "epoch": 0.2852054445999237, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 1.9340792894363403, + "learning_rate": 9.499788045782111e-07, + "loss": 0.4625, + "mean_token_accuracy": 0.852414608001709, + "num_tokens": 85695767.0, + "step": 2242 + }, + { + "epoch": 0.28533265487851417, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 1.9891769886016846, + "learning_rate": 9.504027130139889e-07, + "loss": 0.4794, + "mean_token_accuracy": 0.847256064414978, + "num_tokens": 85737771.0, + "step": 2243 + }, + { + "epoch": 0.2854598651571047, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.9926384687423706, + "learning_rate": 9.508266214497667e-07, + "loss": 0.4267, + "mean_token_accuracy": 0.8677302002906799, + "num_tokens": 85774789.0, + "step": 2244 + }, + { + "epoch": 0.2855870754356952, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 1.8898098468780518, + "learning_rate": 9.512505298855447e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8550437688827515, + "num_tokens": 85817833.0, + "step": 2245 + }, + { + "epoch": 0.2857142857142857, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.9046764373779297, + "learning_rate": 9.516744383213225e-07, + "loss": 0.4104, + "mean_token_accuracy": 0.8674314022064209, + "num_tokens": 85854488.0, + "step": 2246 + }, + { + "epoch": 0.2858414959928762, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.996376633644104, + "learning_rate": 9.520983467571005e-07, + "loss": 0.488, + "mean_token_accuracy": 0.8447539806365967, + "num_tokens": 85893746.0, + "step": 2247 + }, + { + "epoch": 0.28596870627146675, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.9467135667800903, + "learning_rate": 9.525222551928783e-07, + "loss": 0.524, + "mean_token_accuracy": 0.8398511409759521, + "num_tokens": 85932798.0, + "step": 2248 + }, + { + "epoch": 0.2860959165500572, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.0127341747283936, + "learning_rate": 9.529461636286562e-07, + "loss": 0.4405, + "mean_token_accuracy": 0.8596686124801636, + "num_tokens": 85964446.0, + "step": 2249 + }, + { + "epoch": 0.28622312682864776, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.9580491781234741, + "learning_rate": 9.533700720644341e-07, + "loss": 0.4187, + "mean_token_accuracy": 0.8666130900382996, + "num_tokens": 86000587.0, + "step": 2250 + }, + { + "epoch": 0.2863503371072383, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.036778688430786, + "learning_rate": 9.537939805002118e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.8500962853431702, + "num_tokens": 86042891.0, + "step": 2251 + }, + { + "epoch": 0.28647754738582876, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.9677019119262695, + "learning_rate": 9.542178889359898e-07, + "loss": 0.449, + "mean_token_accuracy": 0.8568394184112549, + "num_tokens": 86079973.0, + "step": 2252 + }, + { + "epoch": 0.2866047576644193, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.8183680772781372, + "learning_rate": 9.546417973717677e-07, + "loss": 0.472, + "mean_token_accuracy": 0.8515648245811462, + "num_tokens": 86119767.0, + "step": 2253 + }, + { + "epoch": 0.2867319679430098, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.8732157945632935, + "learning_rate": 9.550657058075455e-07, + "loss": 0.4909, + "mean_token_accuracy": 0.8457576036453247, + "num_tokens": 86163176.0, + "step": 2254 + }, + { + "epoch": 0.2868591782216003, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.048548460006714, + "learning_rate": 9.554896142433234e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.8453472852706909, + "num_tokens": 86201523.0, + "step": 2255 + }, + { + "epoch": 0.2869863885001908, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.9831526279449463, + "learning_rate": 9.559135226791012e-07, + "loss": 0.5048, + "mean_token_accuracy": 0.8360685706138611, + "num_tokens": 86240277.0, + "step": 2256 + }, + { + "epoch": 0.28711359877878134, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.756183385848999, + "learning_rate": 9.563374311148793e-07, + "loss": 0.4951, + "mean_token_accuracy": 0.8438696265220642, + "num_tokens": 86289714.0, + "step": 2257 + }, + { + "epoch": 0.2872408090573718, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.1903419494628906, + "learning_rate": 9.56761339550657e-07, + "loss": 0.447, + "mean_token_accuracy": 0.8592945337295532, + "num_tokens": 86320786.0, + "step": 2258 + }, + { + "epoch": 0.28736801933596234, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.8401373624801636, + "learning_rate": 9.57185247986435e-07, + "loss": 0.4363, + "mean_token_accuracy": 0.8610266447067261, + "num_tokens": 86358002.0, + "step": 2259 + }, + { + "epoch": 0.28749522961455287, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.8335273265838623, + "learning_rate": 9.576091564222128e-07, + "loss": 0.4052, + "mean_token_accuracy": 0.8688405752182007, + "num_tokens": 86395334.0, + "step": 2260 + }, + { + "epoch": 0.28762243989314334, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.8632638454437256, + "learning_rate": 9.580330648579906e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.8443629741668701, + "num_tokens": 86432616.0, + "step": 2261 + }, + { + "epoch": 0.2877496501717339, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.9391417503356934, + "learning_rate": 9.584569732937685e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.8482042551040649, + "num_tokens": 86472293.0, + "step": 2262 + }, + { + "epoch": 0.2878768604503244, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.1330759525299072, + "learning_rate": 9.588808817295463e-07, + "loss": 0.4655, + "mean_token_accuracy": 0.848127007484436, + "num_tokens": 86506304.0, + "step": 2263 + }, + { + "epoch": 0.2880040707289149, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.9767305850982666, + "learning_rate": 9.593047901653242e-07, + "loss": 0.465, + "mean_token_accuracy": 0.8505406379699707, + "num_tokens": 86540520.0, + "step": 2264 + }, + { + "epoch": 0.2881312810075054, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.9474531412124634, + "learning_rate": 9.597286986011022e-07, + "loss": 0.4258, + "mean_token_accuracy": 0.8632238507270813, + "num_tokens": 86576956.0, + "step": 2265 + }, + { + "epoch": 0.28825849128609593, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.044522762298584, + "learning_rate": 9.601526070368799e-07, + "loss": 0.5083, + "mean_token_accuracy": 0.8402336835861206, + "num_tokens": 86613799.0, + "step": 2266 + }, + { + "epoch": 0.2883857015646864, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.9521329402923584, + "learning_rate": 9.60576515472658e-07, + "loss": 0.4521, + "mean_token_accuracy": 0.8557801246643066, + "num_tokens": 86650338.0, + "step": 2267 + }, + { + "epoch": 0.28851291184327693, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.9542087316513062, + "learning_rate": 9.610004239084358e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.850260853767395, + "num_tokens": 86691605.0, + "step": 2268 + }, + { + "epoch": 0.28864012212186746, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.9839071035385132, + "learning_rate": 9.614243323442136e-07, + "loss": 0.5314, + "mean_token_accuracy": 0.8305006623268127, + "num_tokens": 86728767.0, + "step": 2269 + }, + { + "epoch": 0.28876733240045793, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.9729877710342407, + "learning_rate": 9.618482407799915e-07, + "loss": 0.4412, + "mean_token_accuracy": 0.856616735458374, + "num_tokens": 86765240.0, + "step": 2270 + }, + { + "epoch": 0.28889454267904846, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.7188693284988403, + "learning_rate": 9.622721492157693e-07, + "loss": 0.4104, + "mean_token_accuracy": 0.8689998388290405, + "num_tokens": 86804042.0, + "step": 2271 + }, + { + "epoch": 0.289021752957639, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.004110097885132, + "learning_rate": 9.626960576515472e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8544467091560364, + "num_tokens": 86835220.0, + "step": 2272 + }, + { + "epoch": 0.28914896323622946, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.008134603500366, + "learning_rate": 9.63119966087325e-07, + "loss": 0.4469, + "mean_token_accuracy": 0.8542383313179016, + "num_tokens": 86873414.0, + "step": 2273 + }, + { + "epoch": 0.28927617351482, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.7788435220718384, + "learning_rate": 9.635438745231029e-07, + "loss": 0.4379, + "mean_token_accuracy": 0.8609988689422607, + "num_tokens": 86914272.0, + "step": 2274 + }, + { + "epoch": 0.2894033837934105, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.8590199947357178, + "learning_rate": 9.63967782958881e-07, + "loss": 0.4765, + "mean_token_accuracy": 0.8457397818565369, + "num_tokens": 86952425.0, + "step": 2275 + }, + { + "epoch": 0.289530594072001, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.9146339893341064, + "learning_rate": 9.643916913946588e-07, + "loss": 0.4384, + "mean_token_accuracy": 0.8593772649765015, + "num_tokens": 86987842.0, + "step": 2276 + }, + { + "epoch": 0.2896578043505915, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.8365851640701294, + "learning_rate": 9.648155998304366e-07, + "loss": 0.4114, + "mean_token_accuracy": 0.8685932159423828, + "num_tokens": 87026750.0, + "step": 2277 + }, + { + "epoch": 0.28978501462918205, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.0226070880889893, + "learning_rate": 9.652395082662145e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.8338233232498169, + "num_tokens": 87069685.0, + "step": 2278 + }, + { + "epoch": 0.2899122249077726, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.9567818641662598, + "learning_rate": 9.656634167019923e-07, + "loss": 0.4705, + "mean_token_accuracy": 0.846712052822113, + "num_tokens": 87107054.0, + "step": 2279 + }, + { + "epoch": 0.29003943518636305, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.7996548414230347, + "learning_rate": 9.660873251377701e-07, + "loss": 0.4766, + "mean_token_accuracy": 0.8480288982391357, + "num_tokens": 87146254.0, + "step": 2280 + }, + { + "epoch": 0.2901666454649536, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.0799152851104736, + "learning_rate": 9.66511233573548e-07, + "loss": 0.5012, + "mean_token_accuracy": 0.8440079689025879, + "num_tokens": 87182404.0, + "step": 2281 + }, + { + "epoch": 0.2902938557435441, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.8650164604187012, + "learning_rate": 9.669351420093258e-07, + "loss": 0.4338, + "mean_token_accuracy": 0.8614469170570374, + "num_tokens": 87216551.0, + "step": 2282 + }, + { + "epoch": 0.2904210660221346, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.080923557281494, + "learning_rate": 9.67359050445104e-07, + "loss": 0.4566, + "mean_token_accuracy": 0.8525808453559875, + "num_tokens": 87249402.0, + "step": 2283 + }, + { + "epoch": 0.2905482763007251, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.1303648948669434, + "learning_rate": 9.677829588808817e-07, + "loss": 0.444, + "mean_token_accuracy": 0.8557098507881165, + "num_tokens": 87281846.0, + "step": 2284 + }, + { + "epoch": 0.29067548657931563, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.0528676509857178, + "learning_rate": 9.682068673166596e-07, + "loss": 0.5126, + "mean_token_accuracy": 0.8420543670654297, + "num_tokens": 87315765.0, + "step": 2285 + }, + { + "epoch": 0.2908026968579061, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.08156418800354, + "learning_rate": 9.686307757524374e-07, + "loss": 0.4567, + "mean_token_accuracy": 0.8524335622787476, + "num_tokens": 87353668.0, + "step": 2286 + }, + { + "epoch": 0.29092990713649663, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.9146044254302979, + "learning_rate": 9.690546841882153e-07, + "loss": 0.3906, + "mean_token_accuracy": 0.8746009469032288, + "num_tokens": 87390747.0, + "step": 2287 + }, + { + "epoch": 0.29105711741508716, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.9951024055480957, + "learning_rate": 9.694785926239931e-07, + "loss": 0.4696, + "mean_token_accuracy": 0.8526276350021362, + "num_tokens": 87430686.0, + "step": 2288 + }, + { + "epoch": 0.29118432769367764, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.001427173614502, + "learning_rate": 9.69902501059771e-07, + "loss": 0.5089, + "mean_token_accuracy": 0.8392630219459534, + "num_tokens": 87465290.0, + "step": 2289 + }, + { + "epoch": 0.29131153797226816, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.8250348567962646, + "learning_rate": 9.703264094955488e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8477381467819214, + "num_tokens": 87504553.0, + "step": 2290 + }, + { + "epoch": 0.2914387482508587, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.8872451782226562, + "learning_rate": 9.707503179313269e-07, + "loss": 0.4487, + "mean_token_accuracy": 0.8579452037811279, + "num_tokens": 87544896.0, + "step": 2291 + }, + { + "epoch": 0.29156595852944917, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.0868916511535645, + "learning_rate": 9.711742263671047e-07, + "loss": 0.4489, + "mean_token_accuracy": 0.8573074340820312, + "num_tokens": 87582075.0, + "step": 2292 + }, + { + "epoch": 0.2916931688080397, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.8214325904846191, + "learning_rate": 9.715981348028826e-07, + "loss": 0.442, + "mean_token_accuracy": 0.8583802580833435, + "num_tokens": 87623117.0, + "step": 2293 + }, + { + "epoch": 0.2918203790866302, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.156040668487549, + "learning_rate": 9.720220432386604e-07, + "loss": 0.455, + "mean_token_accuracy": 0.8556480407714844, + "num_tokens": 87656263.0, + "step": 2294 + }, + { + "epoch": 0.2919475893652207, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.822569727897644, + "learning_rate": 9.724459516744383e-07, + "loss": 0.4311, + "mean_token_accuracy": 0.8629419803619385, + "num_tokens": 87695852.0, + "step": 2295 + }, + { + "epoch": 0.2920747996438112, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.0819294452667236, + "learning_rate": 9.728698601102161e-07, + "loss": 0.4281, + "mean_token_accuracy": 0.863561749458313, + "num_tokens": 87734331.0, + "step": 2296 + }, + { + "epoch": 0.29220200992240175, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.1341094970703125, + "learning_rate": 9.73293768545994e-07, + "loss": 0.4611, + "mean_token_accuracy": 0.849370002746582, + "num_tokens": 87768447.0, + "step": 2297 + }, + { + "epoch": 0.2923292202009922, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.8796662092208862, + "learning_rate": 9.737176769817718e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.8471717238426208, + "num_tokens": 87813213.0, + "step": 2298 + }, + { + "epoch": 0.29245643047958275, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.9899004697799683, + "learning_rate": 9.741415854175499e-07, + "loss": 0.5233, + "mean_token_accuracy": 0.8381731510162354, + "num_tokens": 87851336.0, + "step": 2299 + }, + { + "epoch": 0.2925836407581733, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.352292060852051, + "learning_rate": 9.745654938533277e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.8476345539093018, + "num_tokens": 87888474.0, + "step": 2300 + }, + { + "epoch": 0.29271085103676375, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.9158059358596802, + "learning_rate": 9.749894022891056e-07, + "loss": 0.5025, + "mean_token_accuracy": 0.8429471850395203, + "num_tokens": 87928232.0, + "step": 2301 + }, + { + "epoch": 0.2928380613153543, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.8797348737716675, + "learning_rate": 9.754133107248834e-07, + "loss": 0.4203, + "mean_token_accuracy": 0.8655033111572266, + "num_tokens": 87966982.0, + "step": 2302 + }, + { + "epoch": 0.2929652715939448, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.985663890838623, + "learning_rate": 9.758372191606612e-07, + "loss": 0.5074, + "mean_token_accuracy": 0.8369009494781494, + "num_tokens": 88003550.0, + "step": 2303 + }, + { + "epoch": 0.2930924818725353, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.960578203201294, + "learning_rate": 9.76261127596439e-07, + "loss": 0.42, + "mean_token_accuracy": 0.8669660687446594, + "num_tokens": 88038038.0, + "step": 2304 + }, + { + "epoch": 0.2932196921511258, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.9178543090820312, + "learning_rate": 9.76685036032217e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.8417850732803345, + "num_tokens": 88074627.0, + "step": 2305 + }, + { + "epoch": 0.29334690242971634, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.7895420789718628, + "learning_rate": 9.771089444679948e-07, + "loss": 0.4332, + "mean_token_accuracy": 0.8608898520469666, + "num_tokens": 88116772.0, + "step": 2306 + }, + { + "epoch": 0.2934741127083068, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.1314809322357178, + "learning_rate": 9.775328529037728e-07, + "loss": 0.4784, + "mean_token_accuracy": 0.8471411466598511, + "num_tokens": 88152888.0, + "step": 2307 + }, + { + "epoch": 0.29360132298689734, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 2.610238552093506, + "learning_rate": 9.779567613395507e-07, + "loss": 0.5067, + "mean_token_accuracy": 0.8428727388381958, + "num_tokens": 88184544.0, + "step": 2308 + }, + { + "epoch": 0.29372853326548787, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 2.0987610816955566, + "learning_rate": 9.783806697753285e-07, + "loss": 0.4516, + "mean_token_accuracy": 0.8574962615966797, + "num_tokens": 88221603.0, + "step": 2309 + }, + { + "epoch": 0.29385574354407834, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 2.1770269870758057, + "learning_rate": 9.788045782111064e-07, + "loss": 0.4246, + "mean_token_accuracy": 0.8638516664505005, + "num_tokens": 88265109.0, + "step": 2310 + }, + { + "epoch": 0.29398295382266887, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 1.7483028173446655, + "learning_rate": 9.792284866468842e-07, + "loss": 0.4152, + "mean_token_accuracy": 0.8645980358123779, + "num_tokens": 88302756.0, + "step": 2311 + }, + { + "epoch": 0.2941101641012594, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 1.8753999471664429, + "learning_rate": 9.79652395082662e-07, + "loss": 0.4158, + "mean_token_accuracy": 0.8621200919151306, + "num_tokens": 88335605.0, + "step": 2312 + }, + { + "epoch": 0.29423737437984987, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 2.0248959064483643, + "learning_rate": 9.8007630351844e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.8461240530014038, + "num_tokens": 88369447.0, + "step": 2313 + }, + { + "epoch": 0.2943645846584404, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.0330185890197754, + "learning_rate": 9.805002119542178e-07, + "loss": 0.5149, + "mean_token_accuracy": 0.8370387554168701, + "num_tokens": 88404442.0, + "step": 2314 + }, + { + "epoch": 0.2944917949370309, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 1.8324023485183716, + "learning_rate": 9.809241203899958e-07, + "loss": 0.4616, + "mean_token_accuracy": 0.8550715446472168, + "num_tokens": 88445870.0, + "step": 2315 + }, + { + "epoch": 0.2946190052156214, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.0848617553710938, + "learning_rate": 9.813480288257737e-07, + "loss": 0.4741, + "mean_token_accuracy": 0.8447170257568359, + "num_tokens": 88480994.0, + "step": 2316 + }, + { + "epoch": 0.2947462154942119, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 1.8715285062789917, + "learning_rate": 9.817719372615515e-07, + "loss": 0.4318, + "mean_token_accuracy": 0.8626134395599365, + "num_tokens": 88519666.0, + "step": 2317 + }, + { + "epoch": 0.29487342577280246, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.096846342086792, + "learning_rate": 9.821958456973294e-07, + "loss": 0.4056, + "mean_token_accuracy": 0.8688961267471313, + "num_tokens": 88553855.0, + "step": 2318 + }, + { + "epoch": 0.29500063605139293, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.1165285110473633, + "learning_rate": 9.826197541331072e-07, + "loss": 0.4217, + "mean_token_accuracy": 0.8664826154708862, + "num_tokens": 88589053.0, + "step": 2319 + }, + { + "epoch": 0.29512784632998346, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.085352897644043, + "learning_rate": 9.83043662568885e-07, + "loss": 0.4805, + "mean_token_accuracy": 0.8488982319831848, + "num_tokens": 88627067.0, + "step": 2320 + }, + { + "epoch": 0.295255056608574, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.7085736989974976, + "learning_rate": 9.83467571004663e-07, + "loss": 0.4239, + "mean_token_accuracy": 0.8640283346176147, + "num_tokens": 88668030.0, + "step": 2321 + }, + { + "epoch": 0.29538226688716446, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 1.872174859046936, + "learning_rate": 9.838914794404407e-07, + "loss": 0.4229, + "mean_token_accuracy": 0.865717887878418, + "num_tokens": 88708889.0, + "step": 2322 + }, + { + "epoch": 0.295509477165755, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.995682954788208, + "learning_rate": 9.843153878762188e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.8469263911247253, + "num_tokens": 88745689.0, + "step": 2323 + }, + { + "epoch": 0.2956366874443455, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.9254117012023926, + "learning_rate": 9.847392963119966e-07, + "loss": 0.442, + "mean_token_accuracy": 0.8591779470443726, + "num_tokens": 88784667.0, + "step": 2324 + }, + { + "epoch": 0.295763897722936, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.8032671213150024, + "learning_rate": 9.851632047477745e-07, + "loss": 0.4543, + "mean_token_accuracy": 0.8532861471176147, + "num_tokens": 88827334.0, + "step": 2325 + }, + { + "epoch": 0.2958911080015265, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.9051120281219482, + "learning_rate": 9.855871131835523e-07, + "loss": 0.3875, + "mean_token_accuracy": 0.874933123588562, + "num_tokens": 88863744.0, + "step": 2326 + }, + { + "epoch": 0.29601831828011704, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.8923401832580566, + "learning_rate": 9.860110216193302e-07, + "loss": 0.4198, + "mean_token_accuracy": 0.8673272728919983, + "num_tokens": 88903693.0, + "step": 2327 + }, + { + "epoch": 0.2961455285587075, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.9323818683624268, + "learning_rate": 9.86434930055108e-07, + "loss": 0.4151, + "mean_token_accuracy": 0.867393970489502, + "num_tokens": 88936668.0, + "step": 2328 + }, + { + "epoch": 0.29627273883729804, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.9945138692855835, + "learning_rate": 9.868588384908859e-07, + "loss": 0.4248, + "mean_token_accuracy": 0.8607867360115051, + "num_tokens": 88970920.0, + "step": 2329 + }, + { + "epoch": 0.2963999491158886, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.9049228429794312, + "learning_rate": 9.872827469266637e-07, + "loss": 0.4278, + "mean_token_accuracy": 0.8606392741203308, + "num_tokens": 89010856.0, + "step": 2330 + }, + { + "epoch": 0.2965271593944791, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.0268454551696777, + "learning_rate": 9.877066553624418e-07, + "loss": 0.4352, + "mean_token_accuracy": 0.860049843788147, + "num_tokens": 89047109.0, + "step": 2331 + }, + { + "epoch": 0.2966543696730696, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 1.8655939102172852, + "learning_rate": 9.881305637982196e-07, + "loss": 0.4454, + "mean_token_accuracy": 0.8591532707214355, + "num_tokens": 89087859.0, + "step": 2332 + }, + { + "epoch": 0.2967815799516601, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 2.037492036819458, + "learning_rate": 9.885544722339975e-07, + "loss": 0.4262, + "mean_token_accuracy": 0.863757848739624, + "num_tokens": 89123601.0, + "step": 2333 + }, + { + "epoch": 0.29690879023025063, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 1.8419514894485474, + "learning_rate": 9.889783806697753e-07, + "loss": 0.4103, + "mean_token_accuracy": 0.8692470788955688, + "num_tokens": 89160133.0, + "step": 2334 + }, + { + "epoch": 0.2970360005088411, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 1.864205002784729, + "learning_rate": 9.894022891055532e-07, + "loss": 0.4047, + "mean_token_accuracy": 0.8700882196426392, + "num_tokens": 89196731.0, + "step": 2335 + }, + { + "epoch": 0.29716321078743163, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 1.949925422668457, + "learning_rate": 9.89826197541331e-07, + "loss": 0.3939, + "mean_token_accuracy": 0.8729699850082397, + "num_tokens": 89230159.0, + "step": 2336 + }, + { + "epoch": 0.29729042106602216, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 1.9867050647735596, + "learning_rate": 9.902501059771089e-07, + "loss": 0.4572, + "mean_token_accuracy": 0.8523162603378296, + "num_tokens": 89264648.0, + "step": 2337 + }, + { + "epoch": 0.29741763134461263, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 1.8797684907913208, + "learning_rate": 9.906740144128867e-07, + "loss": 0.4544, + "mean_token_accuracy": 0.8530987501144409, + "num_tokens": 89302505.0, + "step": 2338 + }, + { + "epoch": 0.29754484162320316, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.9198131561279297, + "learning_rate": 9.910979228486648e-07, + "loss": 0.4147, + "mean_token_accuracy": 0.8670014142990112, + "num_tokens": 89338079.0, + "step": 2339 + }, + { + "epoch": 0.2976720519017937, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 2.0269064903259277, + "learning_rate": 9.915218312844426e-07, + "loss": 0.4794, + "mean_token_accuracy": 0.8514466881752014, + "num_tokens": 89376717.0, + "step": 2340 + }, + { + "epoch": 0.29779926218038416, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 2.490492105484009, + "learning_rate": 9.919457397202205e-07, + "loss": 0.4704, + "mean_token_accuracy": 0.8500007390975952, + "num_tokens": 89416164.0, + "step": 2341 + }, + { + "epoch": 0.2979264724589747, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 2.2985916137695312, + "learning_rate": 9.923696481559983e-07, + "loss": 0.4259, + "mean_token_accuracy": 0.8652881383895874, + "num_tokens": 89446491.0, + "step": 2342 + }, + { + "epoch": 0.2980536827375652, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 2.1189050674438477, + "learning_rate": 9.927935565917761e-07, + "loss": 0.4727, + "mean_token_accuracy": 0.8467226028442383, + "num_tokens": 89483637.0, + "step": 2343 + }, + { + "epoch": 0.2981808930161557, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.8042659759521484, + "learning_rate": 9.93217465027554e-07, + "loss": 0.4649, + "mean_token_accuracy": 0.8552863597869873, + "num_tokens": 89526532.0, + "step": 2344 + }, + { + "epoch": 0.2983081032947462, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 2.1098480224609375, + "learning_rate": 9.936413734633318e-07, + "loss": 0.5265, + "mean_token_accuracy": 0.8384270071983337, + "num_tokens": 89558353.0, + "step": 2345 + }, + { + "epoch": 0.29843531357333675, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 1.8451344966888428, + "learning_rate": 9.940652818991097e-07, + "loss": 0.4183, + "mean_token_accuracy": 0.8662954568862915, + "num_tokens": 89598727.0, + "step": 2346 + }, + { + "epoch": 0.2985625238519272, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.9560884237289429, + "learning_rate": 9.944891903348877e-07, + "loss": 0.4847, + "mean_token_accuracy": 0.8461401462554932, + "num_tokens": 89640093.0, + "step": 2347 + }, + { + "epoch": 0.29868973413051775, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.8577028512954712, + "learning_rate": 9.949130987706656e-07, + "loss": 0.4318, + "mean_token_accuracy": 0.860792338848114, + "num_tokens": 89681401.0, + "step": 2348 + }, + { + "epoch": 0.2988169444091083, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 1.7767829895019531, + "learning_rate": 9.953370072064432e-07, + "loss": 0.4244, + "mean_token_accuracy": 0.8625423908233643, + "num_tokens": 89720366.0, + "step": 2349 + }, + { + "epoch": 0.29894415468769875, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.8184325695037842, + "learning_rate": 9.957609156422213e-07, + "loss": 0.5012, + "mean_token_accuracy": 0.8398516178131104, + "num_tokens": 89762234.0, + "step": 2350 + }, + { + "epoch": 0.2990713649662893, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 2.126417636871338, + "learning_rate": 9.961848240779991e-07, + "loss": 0.4469, + "mean_token_accuracy": 0.8578841686248779, + "num_tokens": 89792208.0, + "step": 2351 + }, + { + "epoch": 0.2991985752448798, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.7999978065490723, + "learning_rate": 9.96608732513777e-07, + "loss": 0.4253, + "mean_token_accuracy": 0.8666139841079712, + "num_tokens": 89833246.0, + "step": 2352 + }, + { + "epoch": 0.2993257855234703, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.832987904548645, + "learning_rate": 9.970326409495548e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8453918099403381, + "num_tokens": 89873271.0, + "step": 2353 + }, + { + "epoch": 0.2994529958020608, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.9016084671020508, + "learning_rate": 9.974565493853327e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.8406310081481934, + "num_tokens": 89913069.0, + "step": 2354 + }, + { + "epoch": 0.29958020608065133, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 2.1695148944854736, + "learning_rate": 9.978804578211107e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.8462367057800293, + "num_tokens": 89948915.0, + "step": 2355 + }, + { + "epoch": 0.2997074163592418, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 2.0463340282440186, + "learning_rate": 9.983043662568886e-07, + "loss": 0.4781, + "mean_token_accuracy": 0.849263608455658, + "num_tokens": 89982123.0, + "step": 2356 + }, + { + "epoch": 0.29983462663783234, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.010744333267212, + "learning_rate": 9.987282746926662e-07, + "loss": 0.4301, + "mean_token_accuracy": 0.862764835357666, + "num_tokens": 90018259.0, + "step": 2357 + }, + { + "epoch": 0.29996183691642286, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.0824100971221924, + "learning_rate": 9.991521831284443e-07, + "loss": 0.4486, + "mean_token_accuracy": 0.8576164245605469, + "num_tokens": 90059041.0, + "step": 2358 + }, + { + "epoch": 0.30008904719501334, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 1.9677965641021729, + "learning_rate": 9.995760915642221e-07, + "loss": 0.3985, + "mean_token_accuracy": 0.8732434511184692, + "num_tokens": 90089463.0, + "step": 2359 + }, + { + "epoch": 0.30021625747360386, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 16.595293045043945, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8626689910888672, + "num_tokens": 90125691.0, + "step": 2360 + }, + { + "epoch": 0.3003434677521944, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 1.9360166788101196, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8540160655975342, + "num_tokens": 90161778.0, + "step": 2361 + }, + { + "epoch": 0.30047067803078487, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 1.8776249885559082, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.841386079788208, + "num_tokens": 90202243.0, + "step": 2362 + }, + { + "epoch": 0.3005978883093754, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 1.8643038272857666, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8456090688705444, + "num_tokens": 90248924.0, + "step": 2363 + }, + { + "epoch": 0.3007250985879659, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.1066956520080566, + "learning_rate": 1e-06, + "loss": 0.5643, + "mean_token_accuracy": 0.8264439105987549, + "num_tokens": 90281988.0, + "step": 2364 + }, + { + "epoch": 0.3008523088665564, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.3224194049835205, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8514961004257202, + "num_tokens": 90317050.0, + "step": 2365 + }, + { + "epoch": 0.3009795191451469, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.730635166168213, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8631662130355835, + "num_tokens": 90358064.0, + "step": 2366 + }, + { + "epoch": 0.30110672942373745, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 1.9248217344284058, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8440333008766174, + "num_tokens": 90400692.0, + "step": 2367 + }, + { + "epoch": 0.3012339397023279, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.089629650115967, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8466325998306274, + "num_tokens": 90432207.0, + "step": 2368 + }, + { + "epoch": 0.30136114998091845, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 2.054051399230957, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8526593446731567, + "num_tokens": 90466081.0, + "step": 2369 + }, + { + "epoch": 0.301488360259509, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 1.9076738357543945, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8615483045578003, + "num_tokens": 90501963.0, + "step": 2370 + }, + { + "epoch": 0.30161557053809945, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 1.9855892658233643, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8649482131004333, + "num_tokens": 90540651.0, + "step": 2371 + }, + { + "epoch": 0.30174278081669, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 1.9169038534164429, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8467761278152466, + "num_tokens": 90585105.0, + "step": 2372 + }, + { + "epoch": 0.3018699910952805, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 16.608163833618164, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8510500192642212, + "num_tokens": 90626773.0, + "step": 2373 + }, + { + "epoch": 0.301997201373871, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 2.879690170288086, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8695566058158875, + "num_tokens": 90665974.0, + "step": 2374 + }, + { + "epoch": 0.3021244116524615, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 2.0283279418945312, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8497179746627808, + "num_tokens": 90706801.0, + "step": 2375 + }, + { + "epoch": 0.30225162193105204, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 2.0321967601776123, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8510682582855225, + "num_tokens": 90743635.0, + "step": 2376 + }, + { + "epoch": 0.3023788322096425, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 1.9847838878631592, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8551733493804932, + "num_tokens": 90782797.0, + "step": 2377 + }, + { + "epoch": 0.30250604248823304, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 1.8222047090530396, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8483811616897583, + "num_tokens": 90822661.0, + "step": 2378 + }, + { + "epoch": 0.30263325276682357, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.0093562602996826, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8359246253967285, + "num_tokens": 90857804.0, + "step": 2379 + }, + { + "epoch": 0.3027604630454141, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.1113181114196777, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8532150387763977, + "num_tokens": 90895778.0, + "step": 2380 + }, + { + "epoch": 0.30288767332400457, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 1.9289979934692383, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8612321019172668, + "num_tokens": 90934463.0, + "step": 2381 + }, + { + "epoch": 0.3030148836025951, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 1.9539834260940552, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.85804283618927, + "num_tokens": 90967888.0, + "step": 2382 + }, + { + "epoch": 0.3031420938811856, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 1.8972471952438354, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8613930344581604, + "num_tokens": 91005200.0, + "step": 2383 + }, + { + "epoch": 0.3032693041597761, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 2.0384016036987305, + "learning_rate": 1e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8376166820526123, + "num_tokens": 91037840.0, + "step": 2384 + }, + { + "epoch": 0.3033965144383666, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 1.966173529624939, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.841360330581665, + "num_tokens": 91073336.0, + "step": 2385 + }, + { + "epoch": 0.30352372471695716, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 2.0152313709259033, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.856906533241272, + "num_tokens": 91107001.0, + "step": 2386 + }, + { + "epoch": 0.30365093499554763, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 1.917504906654358, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8619179129600525, + "num_tokens": 91143937.0, + "step": 2387 + }, + { + "epoch": 0.30377814527413816, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 1.7719532251358032, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8453062176704407, + "num_tokens": 91185391.0, + "step": 2388 + }, + { + "epoch": 0.3039053555527287, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 1.8489952087402344, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8549314141273499, + "num_tokens": 91227653.0, + "step": 2389 + }, + { + "epoch": 0.30403256583131916, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 1.854346752166748, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8738210201263428, + "num_tokens": 91263610.0, + "step": 2390 + }, + { + "epoch": 0.3041597761099097, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.003919839859009, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8482244610786438, + "num_tokens": 91296880.0, + "step": 2391 + }, + { + "epoch": 0.3042869863885002, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.028330087661743, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8440166711807251, + "num_tokens": 91336242.0, + "step": 2392 + }, + { + "epoch": 0.3044141966670907, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 1.8581970930099487, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8629518747329712, + "num_tokens": 91376527.0, + "step": 2393 + }, + { + "epoch": 0.3045414069456812, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 2.1315813064575195, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8459265828132629, + "num_tokens": 91408415.0, + "step": 2394 + }, + { + "epoch": 0.30466861722427174, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 1.8296724557876587, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.856593132019043, + "num_tokens": 91449189.0, + "step": 2395 + }, + { + "epoch": 0.3047958275028622, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 1.9770784378051758, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8518269062042236, + "num_tokens": 91488599.0, + "step": 2396 + }, + { + "epoch": 0.30492303778145274, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 1.9299579858779907, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8580844402313232, + "num_tokens": 91525138.0, + "step": 2397 + }, + { + "epoch": 0.30505024806004327, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 1.7760698795318604, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8559327125549316, + "num_tokens": 91567444.0, + "step": 2398 + }, + { + "epoch": 0.30517745833863374, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.0639076232910156, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8522302508354187, + "num_tokens": 91602926.0, + "step": 2399 + }, + { + "epoch": 0.3053046686172243, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.001462459564209, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8660130500793457, + "num_tokens": 91635901.0, + "step": 2400 + }, + { + "epoch": 0.3054318788958148, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.8747135400772095, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8568710088729858, + "num_tokens": 91679542.0, + "step": 2401 + }, + { + "epoch": 0.3055590891744053, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.8186695575714111, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8492870330810547, + "num_tokens": 91723022.0, + "step": 2402 + }, + { + "epoch": 0.3056862994529958, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.9882051944732666, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8506859540939331, + "num_tokens": 91760664.0, + "step": 2403 + }, + { + "epoch": 0.30581350973158633, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.0419909954071045, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8507148027420044, + "num_tokens": 91805570.0, + "step": 2404 + }, + { + "epoch": 0.3059407200101768, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.9501618146896362, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.847531795501709, + "num_tokens": 91843342.0, + "step": 2405 + }, + { + "epoch": 0.30606793028876733, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.9659711122512817, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8455160856246948, + "num_tokens": 91883872.0, + "step": 2406 + }, + { + "epoch": 0.30619514056735786, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 3.1917057037353516, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.853317141532898, + "num_tokens": 91923174.0, + "step": 2407 + }, + { + "epoch": 0.30632235084594833, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.8887779712677002, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8538605570793152, + "num_tokens": 91963462.0, + "step": 2408 + }, + { + "epoch": 0.30644956112453886, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.8399282693862915, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8570961952209473, + "num_tokens": 92000907.0, + "step": 2409 + }, + { + "epoch": 0.3065767714031294, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 4.804548740386963, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8578377366065979, + "num_tokens": 92036110.0, + "step": 2410 + }, + { + "epoch": 0.30670398168171986, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.0528132915496826, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8509154319763184, + "num_tokens": 92080647.0, + "step": 2411 + }, + { + "epoch": 0.3068311919603104, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.122926712036133, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8580025434494019, + "num_tokens": 92115407.0, + "step": 2412 + }, + { + "epoch": 0.3069584022389009, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.45817232131958, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.854697585105896, + "num_tokens": 92149660.0, + "step": 2413 + }, + { + "epoch": 0.3070856125174914, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.833113193511963, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8573817014694214, + "num_tokens": 92191581.0, + "step": 2414 + }, + { + "epoch": 0.3072128227960819, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.80939519405365, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8498660326004028, + "num_tokens": 92230435.0, + "step": 2415 + }, + { + "epoch": 0.30734003307467245, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.0383214950561523, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.844010591506958, + "num_tokens": 92269545.0, + "step": 2416 + }, + { + "epoch": 0.3074672433532629, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.7803503274917603, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8618208765983582, + "num_tokens": 92307447.0, + "step": 2417 + }, + { + "epoch": 0.30759445363185345, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.7099615335464478, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8530938625335693, + "num_tokens": 92349441.0, + "step": 2418 + }, + { + "epoch": 0.307721663910444, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.0403075218200684, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8584954738616943, + "num_tokens": 92380405.0, + "step": 2419 + }, + { + "epoch": 0.30784887418903445, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.9020251035690308, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8532505631446838, + "num_tokens": 92419596.0, + "step": 2420 + }, + { + "epoch": 0.307976084467625, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.8568555116653442, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8473951816558838, + "num_tokens": 92454590.0, + "step": 2421 + }, + { + "epoch": 0.3081032947462155, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.0120303630828857, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8517088890075684, + "num_tokens": 92491640.0, + "step": 2422 + }, + { + "epoch": 0.308230505024806, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.931693434715271, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8468117713928223, + "num_tokens": 92528682.0, + "step": 2423 + }, + { + "epoch": 0.3083577153033965, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.314690589904785, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8614028692245483, + "num_tokens": 92566368.0, + "step": 2424 + }, + { + "epoch": 0.30848492558198704, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.2857964038848877, + "learning_rate": 1e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8327528238296509, + "num_tokens": 92601380.0, + "step": 2425 + }, + { + "epoch": 0.3086121358605775, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.0694258213043213, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8704533576965332, + "num_tokens": 92639724.0, + "step": 2426 + }, + { + "epoch": 0.30873934613916804, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.985620141029358, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8596771955490112, + "num_tokens": 92677450.0, + "step": 2427 + }, + { + "epoch": 0.30886655641775856, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.9088482856750488, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8638107776641846, + "num_tokens": 92714681.0, + "step": 2428 + }, + { + "epoch": 0.3089937666963491, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.8724972009658813, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8511543273925781, + "num_tokens": 92749925.0, + "step": 2429 + }, + { + "epoch": 0.30912097697493957, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.8884674310684204, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8581677079200745, + "num_tokens": 92791720.0, + "step": 2430 + }, + { + "epoch": 0.3092481872535301, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.968812108039856, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8469597101211548, + "num_tokens": 92826179.0, + "step": 2431 + }, + { + "epoch": 0.3093753975321206, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.7654722929000854, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8537279367446899, + "num_tokens": 92872751.0, + "step": 2432 + }, + { + "epoch": 0.3095026078107111, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.9467350244522095, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8498202562332153, + "num_tokens": 92907470.0, + "step": 2433 + }, + { + "epoch": 0.3096298180893016, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.0028176307678223, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8593212366104126, + "num_tokens": 92943321.0, + "step": 2434 + }, + { + "epoch": 0.30975702836789215, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.999554991722107, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8563269376754761, + "num_tokens": 92982136.0, + "step": 2435 + }, + { + "epoch": 0.3098842386464826, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.222588062286377, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8441370725631714, + "num_tokens": 93015974.0, + "step": 2436 + }, + { + "epoch": 0.31001144892507315, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.9513273239135742, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8578572273254395, + "num_tokens": 93054010.0, + "step": 2437 + }, + { + "epoch": 0.3101386592036637, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 2.1496973037719727, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8632607460021973, + "num_tokens": 93090226.0, + "step": 2438 + }, + { + "epoch": 0.31026586948225415, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 2.0692508220672607, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8519043922424316, + "num_tokens": 93130242.0, + "step": 2439 + }, + { + "epoch": 0.3103930797608447, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 2.0563950538635254, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8498879671096802, + "num_tokens": 93170990.0, + "step": 2440 + }, + { + "epoch": 0.3105202900394352, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 1.9719291925430298, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8465613722801208, + "num_tokens": 93205478.0, + "step": 2441 + }, + { + "epoch": 0.3106475003180257, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 1.89432692527771, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8456239700317383, + "num_tokens": 93240952.0, + "step": 2442 + }, + { + "epoch": 0.3107747105966162, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 1.8366283178329468, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8458445072174072, + "num_tokens": 93278882.0, + "step": 2443 + }, + { + "epoch": 0.31090192087520674, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 1.967449426651001, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8471392393112183, + "num_tokens": 93324573.0, + "step": 2444 + }, + { + "epoch": 0.3110291311537972, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 2.167834520339966, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8672455549240112, + "num_tokens": 93361049.0, + "step": 2445 + }, + { + "epoch": 0.31115634143238774, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 1.9385825395584106, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8511453866958618, + "num_tokens": 93401748.0, + "step": 2446 + }, + { + "epoch": 0.31128355171097827, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 2.0283710956573486, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8660675883293152, + "num_tokens": 93434857.0, + "step": 2447 + }, + { + "epoch": 0.31141076198956874, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 1.8518694639205933, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8529443740844727, + "num_tokens": 93474681.0, + "step": 2448 + }, + { + "epoch": 0.31153797226815927, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 1.9113777875900269, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8532784581184387, + "num_tokens": 93510627.0, + "step": 2449 + }, + { + "epoch": 0.3116651825467498, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 1.765244722366333, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8453300595283508, + "num_tokens": 93551598.0, + "step": 2450 + }, + { + "epoch": 0.31179239282534027, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 2.1202292442321777, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8472703695297241, + "num_tokens": 93586622.0, + "step": 2451 + }, + { + "epoch": 0.3119196031039308, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 1.9492168426513672, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8432604670524597, + "num_tokens": 93626496.0, + "step": 2452 + }, + { + "epoch": 0.3120468133825213, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 2.421649694442749, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8547762036323547, + "num_tokens": 93665641.0, + "step": 2453 + }, + { + "epoch": 0.3121740236611118, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 1.9683057069778442, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8596439361572266, + "num_tokens": 93702135.0, + "step": 2454 + }, + { + "epoch": 0.3123012339397023, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 1.988062858581543, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8600115776062012, + "num_tokens": 93738226.0, + "step": 2455 + }, + { + "epoch": 0.31242844421829286, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.0422251224517822, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8555551767349243, + "num_tokens": 93780273.0, + "step": 2456 + }, + { + "epoch": 0.31255565449688333, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.8657861948013306, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8417699337005615, + "num_tokens": 93821977.0, + "step": 2457 + }, + { + "epoch": 0.31268286477547386, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 1.8714663982391357, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.862544059753418, + "num_tokens": 93856084.0, + "step": 2458 + }, + { + "epoch": 0.3128100750540644, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 1.8638205528259277, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8571072220802307, + "num_tokens": 93893491.0, + "step": 2459 + }, + { + "epoch": 0.31293728533265486, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 1.7104791402816772, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8424571752548218, + "num_tokens": 93940240.0, + "step": 2460 + }, + { + "epoch": 0.3130644956112454, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.022268295288086, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8473195433616638, + "num_tokens": 93974504.0, + "step": 2461 + }, + { + "epoch": 0.3131917058898359, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 2.5230941772460938, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8568907976150513, + "num_tokens": 94009208.0, + "step": 2462 + }, + { + "epoch": 0.3133189161684264, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 1.8621169328689575, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8545422554016113, + "num_tokens": 94049249.0, + "step": 2463 + }, + { + "epoch": 0.3134461264470169, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 2.074967622756958, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8554442524909973, + "num_tokens": 94085553.0, + "step": 2464 + }, + { + "epoch": 0.31357333672560744, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.9655874967575073, + "learning_rate": 1e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8336132764816284, + "num_tokens": 94127531.0, + "step": 2465 + }, + { + "epoch": 0.3137005470041979, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 2.483982563018799, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8662371635437012, + "num_tokens": 94171191.0, + "step": 2466 + }, + { + "epoch": 0.31382775728278844, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.8474512100219727, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8590765595436096, + "num_tokens": 94212090.0, + "step": 2467 + }, + { + "epoch": 0.313954967561379, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.861168622970581, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8731850981712341, + "num_tokens": 94246835.0, + "step": 2468 + }, + { + "epoch": 0.31408217783996945, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.8776425123214722, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8530839085578918, + "num_tokens": 94284581.0, + "step": 2469 + }, + { + "epoch": 0.31420938811856, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.9227081537246704, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8642646074295044, + "num_tokens": 94323050.0, + "step": 2470 + }, + { + "epoch": 0.3143365983971505, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.8207193613052368, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8492249250411987, + "num_tokens": 94364702.0, + "step": 2471 + }, + { + "epoch": 0.314463808675741, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.74716317653656, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8600602149963379, + "num_tokens": 94404735.0, + "step": 2472 + }, + { + "epoch": 0.3145910189543315, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 1.9263875484466553, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8403616547584534, + "num_tokens": 94444683.0, + "step": 2473 + }, + { + "epoch": 0.31471822923292203, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.8210299015045166, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8558943271636963, + "num_tokens": 94484392.0, + "step": 2474 + }, + { + "epoch": 0.3148454395115125, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 2.002997398376465, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8605412244796753, + "num_tokens": 94524860.0, + "step": 2475 + }, + { + "epoch": 0.31497264979010303, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 1.8941850662231445, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8515494465827942, + "num_tokens": 94565076.0, + "step": 2476 + }, + { + "epoch": 0.31509986006869356, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.8721508979797363, + "learning_rate": 1e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.8316290378570557, + "num_tokens": 94606191.0, + "step": 2477 + }, + { + "epoch": 0.31522707034728403, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.8204858303070068, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8472570180892944, + "num_tokens": 94648021.0, + "step": 2478 + }, + { + "epoch": 0.31535428062587456, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.941146969795227, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8652034401893616, + "num_tokens": 94686305.0, + "step": 2479 + }, + { + "epoch": 0.3154814909044651, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.8606374263763428, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8512965440750122, + "num_tokens": 94728606.0, + "step": 2480 + }, + { + "epoch": 0.3156087011830556, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.8913918733596802, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8566470146179199, + "num_tokens": 94770162.0, + "step": 2481 + }, + { + "epoch": 0.3157359114616461, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.8762869834899902, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8605334758758545, + "num_tokens": 94807735.0, + "step": 2482 + }, + { + "epoch": 0.3158631217402366, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 2.075328826904297, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8479166626930237, + "num_tokens": 94840366.0, + "step": 2483 + }, + { + "epoch": 0.31599033201882715, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.9551833868026733, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8506072163581848, + "num_tokens": 94874293.0, + "step": 2484 + }, + { + "epoch": 0.3161175422974176, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 2.212643623352051, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8529783487319946, + "num_tokens": 94909347.0, + "step": 2485 + }, + { + "epoch": 0.31624475257600815, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.7526544332504272, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8631216287612915, + "num_tokens": 94955120.0, + "step": 2486 + }, + { + "epoch": 0.3163719628545987, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.9008159637451172, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8687795996665955, + "num_tokens": 94992319.0, + "step": 2487 + }, + { + "epoch": 0.31649917313318915, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.998512864112854, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8654130697250366, + "num_tokens": 95023464.0, + "step": 2488 + }, + { + "epoch": 0.3166263834117797, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.9215925931930542, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8673787117004395, + "num_tokens": 95055963.0, + "step": 2489 + }, + { + "epoch": 0.3167535936903702, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.9918164014816284, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8356187343597412, + "num_tokens": 95089628.0, + "step": 2490 + }, + { + "epoch": 0.3168808039689607, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.7100915908813477, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8671514391899109, + "num_tokens": 95128557.0, + "step": 2491 + }, + { + "epoch": 0.3170080142475512, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.8661572933197021, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8432538509368896, + "num_tokens": 95168324.0, + "step": 2492 + }, + { + "epoch": 0.31713522452614173, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 2.079005002975464, + "learning_rate": 1e-06, + "loss": 0.529, + "mean_token_accuracy": 0.8351339101791382, + "num_tokens": 95208139.0, + "step": 2493 + }, + { + "epoch": 0.3172624348047322, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.8936394453048706, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.847846508026123, + "num_tokens": 95247337.0, + "step": 2494 + }, + { + "epoch": 0.31738964508332274, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.9459917545318604, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8466960191726685, + "num_tokens": 95289208.0, + "step": 2495 + }, + { + "epoch": 0.31751685536191326, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.7570557594299316, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8526001572608948, + "num_tokens": 95329717.0, + "step": 2496 + }, + { + "epoch": 0.31764406564050374, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 2.059811592102051, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8355040550231934, + "num_tokens": 95371052.0, + "step": 2497 + }, + { + "epoch": 0.31777127591909426, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.7963200807571411, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8733859658241272, + "num_tokens": 95412486.0, + "step": 2498 + }, + { + "epoch": 0.3178984861976848, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.9488439559936523, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8706034421920776, + "num_tokens": 95446167.0, + "step": 2499 + }, + { + "epoch": 0.31802569647627527, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.7997239828109741, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8486503958702087, + "num_tokens": 95485763.0, + "step": 2500 + }, + { + "epoch": 0.3181529067548658, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.863067626953125, + "learning_rate": 1e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8431615829467773, + "num_tokens": 95526407.0, + "step": 2501 + }, + { + "epoch": 0.3182801170334563, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 2.3929200172424316, + "learning_rate": 1e-06, + "loss": 0.562, + "mean_token_accuracy": 0.8224390149116516, + "num_tokens": 95567479.0, + "step": 2502 + }, + { + "epoch": 0.3184073273120468, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.7913060188293457, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8601655960083008, + "num_tokens": 95606588.0, + "step": 2503 + }, + { + "epoch": 0.3185345375906373, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.8187938928604126, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8582727909088135, + "num_tokens": 95653293.0, + "step": 2504 + }, + { + "epoch": 0.31866174786922785, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.9313663244247437, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8458887338638306, + "num_tokens": 95692424.0, + "step": 2505 + }, + { + "epoch": 0.3187889581478183, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.8717116117477417, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8628057837486267, + "num_tokens": 95733679.0, + "step": 2506 + }, + { + "epoch": 0.31891616842640885, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.9100621938705444, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8496569395065308, + "num_tokens": 95768211.0, + "step": 2507 + }, + { + "epoch": 0.3190433787049994, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.836658000946045, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8525316715240479, + "num_tokens": 95805974.0, + "step": 2508 + }, + { + "epoch": 0.31917058898358985, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.988834023475647, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8524566888809204, + "num_tokens": 95845463.0, + "step": 2509 + }, + { + "epoch": 0.3192977992621804, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 1.8493194580078125, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8489466905593872, + "num_tokens": 95888148.0, + "step": 2510 + }, + { + "epoch": 0.3194250095407709, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 1.8738411664962769, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8756577372550964, + "num_tokens": 95925432.0, + "step": 2511 + }, + { + "epoch": 0.3195522198193614, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 1.8023830652236938, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8598828911781311, + "num_tokens": 95970695.0, + "step": 2512 + }, + { + "epoch": 0.3196794300979519, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.9322789907455444, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8547373414039612, + "num_tokens": 96006093.0, + "step": 2513 + }, + { + "epoch": 0.31980664037654244, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.7965283393859863, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8541179299354553, + "num_tokens": 96045945.0, + "step": 2514 + }, + { + "epoch": 0.3199338506551329, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 2.3547394275665283, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8714377880096436, + "num_tokens": 96076748.0, + "step": 2515 + }, + { + "epoch": 0.32006106093372344, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.8918193578720093, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8557617664337158, + "num_tokens": 96116964.0, + "step": 2516 + }, + { + "epoch": 0.32018827121231397, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.017134428024292, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8565284609794617, + "num_tokens": 96152073.0, + "step": 2517 + }, + { + "epoch": 0.32031548149090444, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.9437363147735596, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.852615475654602, + "num_tokens": 96191608.0, + "step": 2518 + }, + { + "epoch": 0.32044269176949497, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.7367960214614868, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8724942207336426, + "num_tokens": 96228124.0, + "step": 2519 + }, + { + "epoch": 0.3205699020480855, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.8478245735168457, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.865379273891449, + "num_tokens": 96265870.0, + "step": 2520 + }, + { + "epoch": 0.32069711232667597, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.7748252153396606, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8651305437088013, + "num_tokens": 96309623.0, + "step": 2521 + }, + { + "epoch": 0.3208243226052665, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.022920608520508, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8496915102005005, + "num_tokens": 96344036.0, + "step": 2522 + }, + { + "epoch": 0.320951532883857, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.0378756523132324, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8665571212768555, + "num_tokens": 96377627.0, + "step": 2523 + }, + { + "epoch": 0.3210787431624475, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.980864405632019, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8559821844100952, + "num_tokens": 96413568.0, + "step": 2524 + }, + { + "epoch": 0.32120595344103803, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 2.127504587173462, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.853279709815979, + "num_tokens": 96454682.0, + "step": 2525 + }, + { + "epoch": 0.32133316371962856, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.0169689655303955, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8444675207138062, + "num_tokens": 96490456.0, + "step": 2526 + }, + { + "epoch": 0.32146037399821903, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.9483346939086914, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8478976488113403, + "num_tokens": 96526891.0, + "step": 2527 + }, + { + "epoch": 0.32158758427680956, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.6430275440216064, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8491867184638977, + "num_tokens": 96575226.0, + "step": 2528 + }, + { + "epoch": 0.3217147945554001, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.0952394008636475, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8452669978141785, + "num_tokens": 96606383.0, + "step": 2529 + }, + { + "epoch": 0.3218420048339906, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.9857484102249146, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8567030429840088, + "num_tokens": 96639826.0, + "step": 2530 + }, + { + "epoch": 0.3219692151125811, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.8635252714157104, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8588970303535461, + "num_tokens": 96679151.0, + "step": 2531 + }, + { + "epoch": 0.3220964253911716, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.8888494968414307, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8617451190948486, + "num_tokens": 96719365.0, + "step": 2532 + }, + { + "epoch": 0.32222363566976214, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.9598891735076904, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.851622462272644, + "num_tokens": 96758318.0, + "step": 2533 + }, + { + "epoch": 0.3223508459483526, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 2.1138298511505127, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8510342836380005, + "num_tokens": 96791159.0, + "step": 2534 + }, + { + "epoch": 0.32247805622694314, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.918471336364746, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8579592704772949, + "num_tokens": 96825352.0, + "step": 2535 + }, + { + "epoch": 0.32260526650553367, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.9617340564727783, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8521571159362793, + "num_tokens": 96859700.0, + "step": 2536 + }, + { + "epoch": 0.32273247678412414, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.974771499633789, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8530564308166504, + "num_tokens": 96897077.0, + "step": 2537 + }, + { + "epoch": 0.3228596870627147, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.9821617603302002, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8504549264907837, + "num_tokens": 96936936.0, + "step": 2538 + }, + { + "epoch": 0.3229868973413052, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.8520227670669556, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8594139814376831, + "num_tokens": 96975702.0, + "step": 2539 + }, + { + "epoch": 0.3231141076198957, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.8469843864440918, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.853945791721344, + "num_tokens": 97010585.0, + "step": 2540 + }, + { + "epoch": 0.3232413178984862, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.9670188426971436, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8787240982055664, + "num_tokens": 97047210.0, + "step": 2541 + }, + { + "epoch": 0.32336852817707673, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.9198274612426758, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.85024094581604, + "num_tokens": 97084835.0, + "step": 2542 + }, + { + "epoch": 0.3234957384556672, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 16.48141860961914, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8521996736526489, + "num_tokens": 97125829.0, + "step": 2543 + }, + { + "epoch": 0.32362294873425773, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.171767234802246, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8480043411254883, + "num_tokens": 97162874.0, + "step": 2544 + }, + { + "epoch": 0.32375015901284826, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.1187758445739746, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8571516871452332, + "num_tokens": 97197559.0, + "step": 2545 + }, + { + "epoch": 0.32387736929143873, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.0848300457000732, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8424369096755981, + "num_tokens": 97227028.0, + "step": 2546 + }, + { + "epoch": 0.32400457957002926, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.8077061176300049, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8503034114837646, + "num_tokens": 97270209.0, + "step": 2547 + }, + { + "epoch": 0.3241317898486198, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.8557038307189941, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.857266902923584, + "num_tokens": 97309176.0, + "step": 2548 + }, + { + "epoch": 0.32425900012721026, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.0238943099975586, + "learning_rate": 1e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.8360452651977539, + "num_tokens": 97340464.0, + "step": 2549 + }, + { + "epoch": 0.3243862104058008, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.8227839469909668, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8590477108955383, + "num_tokens": 97378435.0, + "step": 2550 + }, + { + "epoch": 0.3245134206843913, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.046628952026367, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8497695326805115, + "num_tokens": 97417886.0, + "step": 2551 + }, + { + "epoch": 0.3246406309629818, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.0340046882629395, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8575645685195923, + "num_tokens": 97450552.0, + "step": 2552 + }, + { + "epoch": 0.3247678412415723, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.8532620668411255, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8611261248588562, + "num_tokens": 97488586.0, + "step": 2553 + }, + { + "epoch": 0.32489505152016285, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.8137390613555908, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8498332500457764, + "num_tokens": 97528893.0, + "step": 2554 + }, + { + "epoch": 0.3250222617987533, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.927225112915039, + "learning_rate": 1e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.8307862877845764, + "num_tokens": 97568654.0, + "step": 2555 + }, + { + "epoch": 0.32514947207734385, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 3.179544687271118, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8586341142654419, + "num_tokens": 97607068.0, + "step": 2556 + }, + { + "epoch": 0.3252766823559344, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.897634744644165, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8752196431159973, + "num_tokens": 97647505.0, + "step": 2557 + }, + { + "epoch": 0.32540389263452485, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.8979698419570923, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8489068746566772, + "num_tokens": 97690505.0, + "step": 2558 + }, + { + "epoch": 0.3255311029131154, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.1660029888153076, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8421037197113037, + "num_tokens": 97723955.0, + "step": 2559 + }, + { + "epoch": 0.3256583131917059, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.8835225105285645, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.847928524017334, + "num_tokens": 97763822.0, + "step": 2560 + }, + { + "epoch": 0.3257855234702964, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.842604637145996, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8504627346992493, + "num_tokens": 97803827.0, + "step": 2561 + }, + { + "epoch": 0.3259127337488869, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 3.036561965942383, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8534680008888245, + "num_tokens": 97841056.0, + "step": 2562 + }, + { + "epoch": 0.32603994402747744, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.8883097171783447, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8707704544067383, + "num_tokens": 97881714.0, + "step": 2563 + }, + { + "epoch": 0.3261671543060679, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.3386306762695312, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8520457744598389, + "num_tokens": 97925797.0, + "step": 2564 + }, + { + "epoch": 0.32629436458465844, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.8292814493179321, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8597880601882935, + "num_tokens": 97963638.0, + "step": 2565 + }, + { + "epoch": 0.32642157486324896, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.941339135169983, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8486298322677612, + "num_tokens": 98002043.0, + "step": 2566 + }, + { + "epoch": 0.32654878514183944, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.0451157093048096, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8572633266448975, + "num_tokens": 98041031.0, + "step": 2567 + }, + { + "epoch": 0.32667599542042997, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 2.021838903427124, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8626545667648315, + "num_tokens": 98075510.0, + "step": 2568 + }, + { + "epoch": 0.3268032056990205, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.8001457452774048, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8602904081344604, + "num_tokens": 98114162.0, + "step": 2569 + }, + { + "epoch": 0.32693041597761097, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.9926230907440186, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8501537442207336, + "num_tokens": 98148722.0, + "step": 2570 + }, + { + "epoch": 0.3270576262562015, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 3.2102558612823486, + "learning_rate": 1e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8342169523239136, + "num_tokens": 98188255.0, + "step": 2571 + }, + { + "epoch": 0.327184836534792, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 2.2135298252105713, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8649408221244812, + "num_tokens": 98228597.0, + "step": 2572 + }, + { + "epoch": 0.3273120468133825, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.8607187271118164, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8455449938774109, + "num_tokens": 98273405.0, + "step": 2573 + }, + { + "epoch": 0.327439257091973, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.8627803325653076, + "learning_rate": 1e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8397804498672485, + "num_tokens": 98315091.0, + "step": 2574 + }, + { + "epoch": 0.32756646737056355, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.9462958574295044, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8551499843597412, + "num_tokens": 98356967.0, + "step": 2575 + }, + { + "epoch": 0.327693677649154, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 2.3970258235931396, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8452107906341553, + "num_tokens": 98394455.0, + "step": 2576 + }, + { + "epoch": 0.32782088792774455, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 2.003774642944336, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8621765375137329, + "num_tokens": 98430455.0, + "step": 2577 + }, + { + "epoch": 0.3279480982063351, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.8140983581542969, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8402268886566162, + "num_tokens": 98472218.0, + "step": 2578 + }, + { + "epoch": 0.3280753084849256, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.956893801689148, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8619526624679565, + "num_tokens": 98509540.0, + "step": 2579 + }, + { + "epoch": 0.3282025187635161, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.833411455154419, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8534654378890991, + "num_tokens": 98544761.0, + "step": 2580 + }, + { + "epoch": 0.3283297290421066, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.875689148902893, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8675628900527954, + "num_tokens": 98580522.0, + "step": 2581 + }, + { + "epoch": 0.32845693932069714, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.89973783493042, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8628524541854858, + "num_tokens": 98612988.0, + "step": 2582 + }, + { + "epoch": 0.3285841495992876, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.8269885778427124, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8429705500602722, + "num_tokens": 98654213.0, + "step": 2583 + }, + { + "epoch": 0.32871135987787814, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.8870805501937866, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.857168436050415, + "num_tokens": 98697168.0, + "step": 2584 + }, + { + "epoch": 0.32883857015646867, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.986659049987793, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8595777750015259, + "num_tokens": 98735664.0, + "step": 2585 + }, + { + "epoch": 0.32896578043505914, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.893697738647461, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8504752516746521, + "num_tokens": 98777257.0, + "step": 2586 + }, + { + "epoch": 0.32909299071364967, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.9780290126800537, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8587779998779297, + "num_tokens": 98812443.0, + "step": 2587 + }, + { + "epoch": 0.3292202009922402, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.9153963327407837, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8624737858772278, + "num_tokens": 98848231.0, + "step": 2588 + }, + { + "epoch": 0.32934741127083067, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 2.071206569671631, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8575907945632935, + "num_tokens": 98886487.0, + "step": 2589 + }, + { + "epoch": 0.3294746215494212, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.8110967874526978, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8438326120376587, + "num_tokens": 98924750.0, + "step": 2590 + }, + { + "epoch": 0.3296018318280117, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.9669544696807861, + "learning_rate": 1e-06, + "loss": 0.5397, + "mean_token_accuracy": 0.8331011533737183, + "num_tokens": 98964151.0, + "step": 2591 + }, + { + "epoch": 0.3297290421066022, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.9516007900238037, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.865700364112854, + "num_tokens": 99000277.0, + "step": 2592 + }, + { + "epoch": 0.3298562523851927, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.770533800125122, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8443722724914551, + "num_tokens": 99042335.0, + "step": 2593 + }, + { + "epoch": 0.32998346266378326, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.7763657569885254, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8538737297058105, + "num_tokens": 99082964.0, + "step": 2594 + }, + { + "epoch": 0.33011067294237373, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.718309998512268, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8774420022964478, + "num_tokens": 99123440.0, + "step": 2595 + }, + { + "epoch": 0.33023788322096426, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.7563443183898926, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8530663251876831, + "num_tokens": 99167047.0, + "step": 2596 + }, + { + "epoch": 0.3303650934995548, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.9842277765274048, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8507063388824463, + "num_tokens": 99202594.0, + "step": 2597 + }, + { + "epoch": 0.33049230377814526, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.8035714626312256, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.861544132232666, + "num_tokens": 99244732.0, + "step": 2598 + }, + { + "epoch": 0.3306195140567358, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.82918381690979, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8650799989700317, + "num_tokens": 99287161.0, + "step": 2599 + }, + { + "epoch": 0.3307467243353263, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.9795401096343994, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.847924530506134, + "num_tokens": 99322650.0, + "step": 2600 + }, + { + "epoch": 0.3308739346139168, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.9806283712387085, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8553228378295898, + "num_tokens": 99358529.0, + "step": 2601 + }, + { + "epoch": 0.3310011448925073, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 2.063433885574341, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8637573719024658, + "num_tokens": 99394010.0, + "step": 2602 + }, + { + "epoch": 0.33112835517109784, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.9725450277328491, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8580975532531738, + "num_tokens": 99434764.0, + "step": 2603 + }, + { + "epoch": 0.3312555654496883, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.7529261112213135, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8570463061332703, + "num_tokens": 99481322.0, + "step": 2604 + }, + { + "epoch": 0.33138277572827884, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 1.975442886352539, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8443167805671692, + "num_tokens": 99517719.0, + "step": 2605 + }, + { + "epoch": 0.3315099860068694, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.8910844326019287, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8581673502922058, + "num_tokens": 99553662.0, + "step": 2606 + }, + { + "epoch": 0.33163719628545985, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 2.038588523864746, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8556044101715088, + "num_tokens": 99589195.0, + "step": 2607 + }, + { + "epoch": 0.3317644065640504, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.9581912755966187, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8567913770675659, + "num_tokens": 99626092.0, + "step": 2608 + }, + { + "epoch": 0.3318916168426409, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.9267736673355103, + "learning_rate": 1e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.8301280736923218, + "num_tokens": 99667087.0, + "step": 2609 + }, + { + "epoch": 0.3320188271212314, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.9227867126464844, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8498289585113525, + "num_tokens": 99705929.0, + "step": 2610 + }, + { + "epoch": 0.3321460373998219, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.9668264389038086, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8515253663063049, + "num_tokens": 99742044.0, + "step": 2611 + }, + { + "epoch": 0.33227324767841243, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 2.122370719909668, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8590455055236816, + "num_tokens": 99772621.0, + "step": 2612 + }, + { + "epoch": 0.3324004579570029, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.8036330938339233, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8616708517074585, + "num_tokens": 99814405.0, + "step": 2613 + }, + { + "epoch": 0.33252766823559343, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 2.014686346054077, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8661630749702454, + "num_tokens": 99852197.0, + "step": 2614 + }, + { + "epoch": 0.33265487851418396, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.9815341234207153, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.849676251411438, + "num_tokens": 99894254.0, + "step": 2615 + }, + { + "epoch": 0.33278208879277443, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.8282619714736938, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8503551483154297, + "num_tokens": 99936230.0, + "step": 2616 + }, + { + "epoch": 0.33290929907136496, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.8773350715637207, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.855962872505188, + "num_tokens": 99974964.0, + "step": 2617 + }, + { + "epoch": 0.3330365093499555, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.8280186653137207, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8466413617134094, + "num_tokens": 100018750.0, + "step": 2618 + }, + { + "epoch": 0.33316371962854596, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 2.031148672103882, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8492578864097595, + "num_tokens": 100054064.0, + "step": 2619 + }, + { + "epoch": 0.3332909299071365, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.931153416633606, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8605183362960815, + "num_tokens": 100092549.0, + "step": 2620 + }, + { + "epoch": 0.333418140185727, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 2.140070676803589, + "learning_rate": 1e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.828864336013794, + "num_tokens": 100123973.0, + "step": 2621 + }, + { + "epoch": 0.3335453504643175, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.9968383312225342, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8541038632392883, + "num_tokens": 100162142.0, + "step": 2622 + }, + { + "epoch": 0.333672560742908, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.9705922603607178, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8493716716766357, + "num_tokens": 100199352.0, + "step": 2623 + }, + { + "epoch": 0.33379977102149855, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 2.1417384147644043, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8391921520233154, + "num_tokens": 100237446.0, + "step": 2624 + }, + { + "epoch": 0.333926981300089, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.9461023807525635, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8595446348190308, + "num_tokens": 100272986.0, + "step": 2625 + }, + { + "epoch": 0.33405419157867955, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 2.1192283630371094, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8452451229095459, + "num_tokens": 100304998.0, + "step": 2626 + }, + { + "epoch": 0.3341814018572701, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 2.0709545612335205, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8503943681716919, + "num_tokens": 100338473.0, + "step": 2627 + }, + { + "epoch": 0.33430861213586055, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 2.0764167308807373, + "learning_rate": 1e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8385831117630005, + "num_tokens": 100372644.0, + "step": 2628 + }, + { + "epoch": 0.3344358224144511, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 2.092522144317627, + "learning_rate": 1e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8342633247375488, + "num_tokens": 100413248.0, + "step": 2629 + }, + { + "epoch": 0.3345630326930416, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 2.23593807220459, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8488770723342896, + "num_tokens": 100448022.0, + "step": 2630 + }, + { + "epoch": 0.33469024297163213, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 2.4483377933502197, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.842171311378479, + "num_tokens": 100493222.0, + "step": 2631 + }, + { + "epoch": 0.3348174532502226, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 2.050492525100708, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8604285717010498, + "num_tokens": 100534736.0, + "step": 2632 + }, + { + "epoch": 0.33494466352881314, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.9052764177322388, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8624734878540039, + "num_tokens": 100576754.0, + "step": 2633 + }, + { + "epoch": 0.33507187380740366, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 2.0206401348114014, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8480314016342163, + "num_tokens": 100611407.0, + "step": 2634 + }, + { + "epoch": 0.33519908408599414, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.9903017282485962, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8595899343490601, + "num_tokens": 100643531.0, + "step": 2635 + }, + { + "epoch": 0.33532629436458466, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 2.029906749725342, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8568552136421204, + "num_tokens": 100678612.0, + "step": 2636 + }, + { + "epoch": 0.3354535046431752, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.9256035089492798, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8589655756950378, + "num_tokens": 100718873.0, + "step": 2637 + }, + { + "epoch": 0.33558071492176567, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.8822522163391113, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8547226190567017, + "num_tokens": 100759883.0, + "step": 2638 + }, + { + "epoch": 0.3357079252003562, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.9609050750732422, + "learning_rate": 1e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8344632387161255, + "num_tokens": 100800022.0, + "step": 2639 + }, + { + "epoch": 0.3358351354789467, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 2.1091668605804443, + "learning_rate": 1e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8399276733398438, + "num_tokens": 100837299.0, + "step": 2640 + }, + { + "epoch": 0.3359623457575372, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 2.073821544647217, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8513596057891846, + "num_tokens": 100868433.0, + "step": 2641 + }, + { + "epoch": 0.3360895560361277, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.8343162536621094, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8381001949310303, + "num_tokens": 100907768.0, + "step": 2642 + }, + { + "epoch": 0.33621676631471825, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.8936405181884766, + "learning_rate": 1e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.8310501575469971, + "num_tokens": 100942723.0, + "step": 2643 + }, + { + "epoch": 0.3363439765933087, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.8873463869094849, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.858722448348999, + "num_tokens": 100984997.0, + "step": 2644 + }, + { + "epoch": 0.33647118687189925, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.724213719367981, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8690968751907349, + "num_tokens": 101026290.0, + "step": 2645 + }, + { + "epoch": 0.3365983971504898, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 2.001560926437378, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8467918634414673, + "num_tokens": 101066766.0, + "step": 2646 + }, + { + "epoch": 0.33672560742908025, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 2.1895523071289062, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8583502769470215, + "num_tokens": 101100833.0, + "step": 2647 + }, + { + "epoch": 0.3368528177076708, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.8426954746246338, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8647016286849976, + "num_tokens": 101145552.0, + "step": 2648 + }, + { + "epoch": 0.3369800279862613, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.8693699836730957, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8561056852340698, + "num_tokens": 101186644.0, + "step": 2649 + }, + { + "epoch": 0.3371072382648518, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.038684606552124, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8519014120101929, + "num_tokens": 101223356.0, + "step": 2650 + }, + { + "epoch": 0.3372344485434423, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.9406650066375732, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.867326021194458, + "num_tokens": 101260527.0, + "step": 2651 + }, + { + "epoch": 0.33736165882203284, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.8519703149795532, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8530557751655579, + "num_tokens": 101305403.0, + "step": 2652 + }, + { + "epoch": 0.3374888691006233, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.792718529701233, + "learning_rate": 1e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.8401934504508972, + "num_tokens": 101345858.0, + "step": 2653 + }, + { + "epoch": 0.33761607937921384, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 2.057281494140625, + "learning_rate": 1e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.8315174579620361, + "num_tokens": 101380321.0, + "step": 2654 + }, + { + "epoch": 0.33774328965780437, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.8403149843215942, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8625160455703735, + "num_tokens": 101419120.0, + "step": 2655 + }, + { + "epoch": 0.33787049993639484, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.9529109001159668, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8553792834281921, + "num_tokens": 101453272.0, + "step": 2656 + }, + { + "epoch": 0.33799771021498537, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 2.126760482788086, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8420270085334778, + "num_tokens": 101486879.0, + "step": 2657 + }, + { + "epoch": 0.3381249204935759, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.9650423526763916, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8468759059906006, + "num_tokens": 101523691.0, + "step": 2658 + }, + { + "epoch": 0.33825213077216637, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 2.055072784423828, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.854806125164032, + "num_tokens": 101555279.0, + "step": 2659 + }, + { + "epoch": 0.3383793410507569, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.9899805784225464, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8699360489845276, + "num_tokens": 101591780.0, + "step": 2660 + }, + { + "epoch": 0.3385065513293474, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.8639050722122192, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8700069189071655, + "num_tokens": 101631041.0, + "step": 2661 + }, + { + "epoch": 0.3386337616079379, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.9447354078292847, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8445200324058533, + "num_tokens": 101666315.0, + "step": 2662 + }, + { + "epoch": 0.33876097188652843, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.7995001077651978, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8666932582855225, + "num_tokens": 101704046.0, + "step": 2663 + }, + { + "epoch": 0.33888818216511896, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.8171812295913696, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8657783269882202, + "num_tokens": 101743076.0, + "step": 2664 + }, + { + "epoch": 0.33901539244370943, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.8439733982086182, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8668289184570312, + "num_tokens": 101784006.0, + "step": 2665 + }, + { + "epoch": 0.33914260272229996, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.881922721862793, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8612792491912842, + "num_tokens": 101821845.0, + "step": 2666 + }, + { + "epoch": 0.3392698130008905, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.7951478958129883, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8599957823753357, + "num_tokens": 101861121.0, + "step": 2667 + }, + { + "epoch": 0.33939702327948096, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 2.1356518268585205, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8508075475692749, + "num_tokens": 101894330.0, + "step": 2668 + }, + { + "epoch": 0.3395242335580715, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.8947573900222778, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8409688472747803, + "num_tokens": 101934835.0, + "step": 2669 + }, + { + "epoch": 0.339651443836662, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 2.015363931655884, + "learning_rate": 1e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.8379305601119995, + "num_tokens": 101970943.0, + "step": 2670 + }, + { + "epoch": 0.3397786541152525, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 2.115295648574829, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8424049615859985, + "num_tokens": 102001211.0, + "step": 2671 + }, + { + "epoch": 0.339905864393843, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.8900010585784912, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.861737847328186, + "num_tokens": 102039399.0, + "step": 2672 + }, + { + "epoch": 0.34003307467243354, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 2.114126205444336, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8443437814712524, + "num_tokens": 102070635.0, + "step": 2673 + }, + { + "epoch": 0.340160284951024, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.9539610147476196, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8588963747024536, + "num_tokens": 102104590.0, + "step": 2674 + }, + { + "epoch": 0.34028749522961454, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.754056692123413, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8598196506500244, + "num_tokens": 102149267.0, + "step": 2675 + }, + { + "epoch": 0.3404147055082051, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.9711711406707764, + "learning_rate": 1e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.8387891054153442, + "num_tokens": 102188089.0, + "step": 2676 + }, + { + "epoch": 0.34054191578679555, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.883248209953308, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8616968393325806, + "num_tokens": 102224335.0, + "step": 2677 + }, + { + "epoch": 0.3406691260653861, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.8558353185653687, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8580163717269897, + "num_tokens": 102263421.0, + "step": 2678 + }, + { + "epoch": 0.3407963363439766, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.921239972114563, + "learning_rate": 1e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8378005027770996, + "num_tokens": 102301133.0, + "step": 2679 + }, + { + "epoch": 0.34092354662256713, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.9188917875289917, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8580791354179382, + "num_tokens": 102338516.0, + "step": 2680 + }, + { + "epoch": 0.3410507569011576, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.9111640453338623, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8438683152198792, + "num_tokens": 102376010.0, + "step": 2681 + }, + { + "epoch": 0.34117796717974813, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.7901068925857544, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8759637475013733, + "num_tokens": 102417080.0, + "step": 2682 + }, + { + "epoch": 0.34130517745833866, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.823672890663147, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8565307855606079, + "num_tokens": 102458042.0, + "step": 2683 + }, + { + "epoch": 0.34143238773692913, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.7497329711914062, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.863351583480835, + "num_tokens": 102497152.0, + "step": 2684 + }, + { + "epoch": 0.34155959801551966, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.9409703016281128, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8538929224014282, + "num_tokens": 102531486.0, + "step": 2685 + }, + { + "epoch": 0.3416868082941102, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.8145787715911865, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8538184762001038, + "num_tokens": 102571552.0, + "step": 2686 + }, + { + "epoch": 0.34181401857270066, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.9823178052902222, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8599414825439453, + "num_tokens": 102607338.0, + "step": 2687 + }, + { + "epoch": 0.3419412288512912, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.7007498741149902, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8553494215011597, + "num_tokens": 102654445.0, + "step": 2688 + }, + { + "epoch": 0.3420684391298817, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.9255101680755615, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.863858699798584, + "num_tokens": 102695265.0, + "step": 2689 + }, + { + "epoch": 0.3421956494084722, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.889277696609497, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8599970936775208, + "num_tokens": 102728621.0, + "step": 2690 + }, + { + "epoch": 0.3423228596870627, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.9541922807693481, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8489843606948853, + "num_tokens": 102766378.0, + "step": 2691 + }, + { + "epoch": 0.34245006996565325, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.8337109088897705, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.853064775466919, + "num_tokens": 102806331.0, + "step": 2692 + }, + { + "epoch": 0.3425772802442437, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 16.597990036010742, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8579646348953247, + "num_tokens": 102845314.0, + "step": 2693 + }, + { + "epoch": 0.34270449052283425, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.31329345703125, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8528941869735718, + "num_tokens": 102888614.0, + "step": 2694 + }, + { + "epoch": 0.3428317008014248, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.078921318054199, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8496428728103638, + "num_tokens": 102925988.0, + "step": 2695 + }, + { + "epoch": 0.34295891108001525, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.864730715751648, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8628681302070618, + "num_tokens": 102963150.0, + "step": 2696 + }, + { + "epoch": 0.3430861213586058, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9297651052474976, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8634092807769775, + "num_tokens": 102999164.0, + "step": 2697 + }, + { + "epoch": 0.3432133316371963, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.0185251235961914, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.86378014087677, + "num_tokens": 103034337.0, + "step": 2698 + }, + { + "epoch": 0.3433405419157868, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 1.9148831367492676, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8613032698631287, + "num_tokens": 103075062.0, + "step": 2699 + }, + { + "epoch": 0.3434677521943773, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.8526968955993652, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8511519432067871, + "num_tokens": 103113209.0, + "step": 2700 + }, + { + "epoch": 0.34359496247296784, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.4797134399414062, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8590881824493408, + "num_tokens": 103153117.0, + "step": 2701 + }, + { + "epoch": 0.3437221727515583, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.342559576034546, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.853339672088623, + "num_tokens": 103192234.0, + "step": 2702 + }, + { + "epoch": 0.34384938303014884, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.8918051719665527, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8526376485824585, + "num_tokens": 103224958.0, + "step": 2703 + }, + { + "epoch": 0.34397659330873936, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.9613440036773682, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8571281433105469, + "num_tokens": 103259679.0, + "step": 2704 + }, + { + "epoch": 0.34410380358732984, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.8892643451690674, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8476290702819824, + "num_tokens": 103299987.0, + "step": 2705 + }, + { + "epoch": 0.34423101386592037, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.9028955698013306, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8421515822410583, + "num_tokens": 103344457.0, + "step": 2706 + }, + { + "epoch": 0.3443582241445109, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.159829616546631, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8596908450126648, + "num_tokens": 103385173.0, + "step": 2707 + }, + { + "epoch": 0.34448543442310137, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.9828304052352905, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8395859003067017, + "num_tokens": 103425972.0, + "step": 2708 + }, + { + "epoch": 0.3446126447016919, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.417388677597046, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8513780832290649, + "num_tokens": 103466819.0, + "step": 2709 + }, + { + "epoch": 0.3447398549802824, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.835601806640625, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.843505859375, + "num_tokens": 103509660.0, + "step": 2710 + }, + { + "epoch": 0.3448670652588729, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.9831799268722534, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8493771553039551, + "num_tokens": 103544557.0, + "step": 2711 + }, + { + "epoch": 0.3449942755374634, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.1081764698028564, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8647181987762451, + "num_tokens": 103585132.0, + "step": 2712 + }, + { + "epoch": 0.34512148581605395, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.7870761156082153, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.864011287689209, + "num_tokens": 103626353.0, + "step": 2713 + }, + { + "epoch": 0.3452486960946444, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 2.0398128032684326, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8578451871871948, + "num_tokens": 103659035.0, + "step": 2714 + }, + { + "epoch": 0.34537590637323495, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.8072923421859741, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8623454570770264, + "num_tokens": 103699497.0, + "step": 2715 + }, + { + "epoch": 0.3455031166518255, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.982416033744812, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8436106443405151, + "num_tokens": 103733685.0, + "step": 2716 + }, + { + "epoch": 0.34563032693041595, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 2.2563834190368652, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8383740782737732, + "num_tokens": 103769399.0, + "step": 2717 + }, + { + "epoch": 0.3457575372090065, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.895853042602539, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8624704480171204, + "num_tokens": 103809078.0, + "step": 2718 + }, + { + "epoch": 0.345884747487597, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.7882691621780396, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8719205260276794, + "num_tokens": 103846764.0, + "step": 2719 + }, + { + "epoch": 0.3460119577661875, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 2.0188915729522705, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8641760349273682, + "num_tokens": 103883089.0, + "step": 2720 + }, + { + "epoch": 0.346139168044778, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.8946490287780762, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8735580444335938, + "num_tokens": 103915775.0, + "step": 2721 + }, + { + "epoch": 0.34626637832336854, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.8143421411514282, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8509127497673035, + "num_tokens": 103953703.0, + "step": 2722 + }, + { + "epoch": 0.346393588601959, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 2.0381338596343994, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8443199396133423, + "num_tokens": 103990775.0, + "step": 2723 + }, + { + "epoch": 0.34652079888054954, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.9051449298858643, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8595925569534302, + "num_tokens": 104025927.0, + "step": 2724 + }, + { + "epoch": 0.34664800915914007, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.8916665315628052, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.846347451210022, + "num_tokens": 104065860.0, + "step": 2725 + }, + { + "epoch": 0.34677521943773054, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.9874346256256104, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8571701049804688, + "num_tokens": 104102117.0, + "step": 2726 + }, + { + "epoch": 0.34690242971632107, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.8749756813049316, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8526670336723328, + "num_tokens": 104140896.0, + "step": 2727 + }, + { + "epoch": 0.3470296399949116, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.0755677223205566, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8534500598907471, + "num_tokens": 104181355.0, + "step": 2728 + }, + { + "epoch": 0.3471568502735021, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.116297721862793, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8562842607498169, + "num_tokens": 104218564.0, + "step": 2729 + }, + { + "epoch": 0.3472840605520926, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.784420371055603, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8605409860610962, + "num_tokens": 104256553.0, + "step": 2730 + }, + { + "epoch": 0.3474112708306831, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.9388360977172852, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8683534264564514, + "num_tokens": 104291150.0, + "step": 2731 + }, + { + "epoch": 0.34753848110927366, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.8764315843582153, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.848721981048584, + "num_tokens": 104327977.0, + "step": 2732 + }, + { + "epoch": 0.34766569138786413, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.1840922832489014, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8530957698822021, + "num_tokens": 104365167.0, + "step": 2733 + }, + { + "epoch": 0.34779290166645466, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.0330772399902344, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8602199554443359, + "num_tokens": 104400304.0, + "step": 2734 + }, + { + "epoch": 0.3479201119450452, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.9584650993347168, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.840955376625061, + "num_tokens": 104437974.0, + "step": 2735 + }, + { + "epoch": 0.34804732222363566, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.9407846927642822, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8594173789024353, + "num_tokens": 104474474.0, + "step": 2736 + }, + { + "epoch": 0.3481745325022262, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.1638071537017822, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8409756422042847, + "num_tokens": 104511004.0, + "step": 2737 + }, + { + "epoch": 0.3483017427808167, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.9702565670013428, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8495945930480957, + "num_tokens": 104551889.0, + "step": 2738 + }, + { + "epoch": 0.3484289530594072, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.118575096130371, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8407245874404907, + "num_tokens": 104591962.0, + "step": 2739 + }, + { + "epoch": 0.3485561633379977, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.0174245834350586, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.853803277015686, + "num_tokens": 104636201.0, + "step": 2740 + }, + { + "epoch": 0.34868337361658824, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 1.8578472137451172, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.854932427406311, + "num_tokens": 104673448.0, + "step": 2741 + }, + { + "epoch": 0.3488105838951787, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 1.9197527170181274, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.863289475440979, + "num_tokens": 104708879.0, + "step": 2742 + }, + { + "epoch": 0.34893779417376924, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.2045798301696777, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8658407926559448, + "num_tokens": 104747073.0, + "step": 2743 + }, + { + "epoch": 0.3490650044523598, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 1.8864696025848389, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8540036678314209, + "num_tokens": 104786950.0, + "step": 2744 + }, + { + "epoch": 0.34919221473095025, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9090025424957275, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8610278367996216, + "num_tokens": 104828306.0, + "step": 2745 + }, + { + "epoch": 0.3493194250095408, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 16.622751235961914, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8478966355323792, + "num_tokens": 104867030.0, + "step": 2746 + }, + { + "epoch": 0.3494466352881313, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 2.145639657974243, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8596333265304565, + "num_tokens": 104909182.0, + "step": 2747 + }, + { + "epoch": 0.3495738455667218, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.914409875869751, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.851585865020752, + "num_tokens": 104950648.0, + "step": 2748 + }, + { + "epoch": 0.3497010558453123, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.8178008794784546, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8606806993484497, + "num_tokens": 104994780.0, + "step": 2749 + }, + { + "epoch": 0.34982826612390283, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.8648079633712769, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8629209995269775, + "num_tokens": 105029469.0, + "step": 2750 + }, + { + "epoch": 0.3499554764024933, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.0320396423339844, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.849799633026123, + "num_tokens": 105066797.0, + "step": 2751 + }, + { + "epoch": 0.35008268668108383, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9691740274429321, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8718663454055786, + "num_tokens": 105100869.0, + "step": 2752 + }, + { + "epoch": 0.35020989695967436, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9118849039077759, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8681749701499939, + "num_tokens": 105135409.0, + "step": 2753 + }, + { + "epoch": 0.35033710723826483, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.897242546081543, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8627996444702148, + "num_tokens": 105172839.0, + "step": 2754 + }, + { + "epoch": 0.35046431751685536, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.8181413412094116, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8619702458381653, + "num_tokens": 105216656.0, + "step": 2755 + }, + { + "epoch": 0.3505915277954459, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.8737640380859375, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8490736484527588, + "num_tokens": 105253740.0, + "step": 2756 + }, + { + "epoch": 0.35071873807403636, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.119257688522339, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8593895435333252, + "num_tokens": 105290073.0, + "step": 2757 + }, + { + "epoch": 0.3508459483526269, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.8470125198364258, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8641737103462219, + "num_tokens": 105325933.0, + "step": 2758 + }, + { + "epoch": 0.3509731586312174, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.9553017616271973, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8600276112556458, + "num_tokens": 105362962.0, + "step": 2759 + }, + { + "epoch": 0.3511003689098079, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.7837601900100708, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8568857908248901, + "num_tokens": 105410074.0, + "step": 2760 + }, + { + "epoch": 0.3512275791883984, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.3832480907440186, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.840994119644165, + "num_tokens": 105450264.0, + "step": 2761 + }, + { + "epoch": 0.35135478946698895, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.9559392929077148, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8464632034301758, + "num_tokens": 105486566.0, + "step": 2762 + }, + { + "epoch": 0.3514819997455794, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.784705638885498, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8601047992706299, + "num_tokens": 105527185.0, + "step": 2763 + }, + { + "epoch": 0.35160921002416995, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.035369873046875, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8544027805328369, + "num_tokens": 105557401.0, + "step": 2764 + }, + { + "epoch": 0.3517364203027605, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.9490236043930054, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8639612197875977, + "num_tokens": 105586877.0, + "step": 2765 + }, + { + "epoch": 0.35186363058135095, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9731274843215942, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8620582222938538, + "num_tokens": 105626988.0, + "step": 2766 + }, + { + "epoch": 0.3519908408599415, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9093505144119263, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8486807942390442, + "num_tokens": 105665796.0, + "step": 2767 + }, + { + "epoch": 0.352118051138532, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 1.683864951133728, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8492541313171387, + "num_tokens": 105708892.0, + "step": 2768 + }, + { + "epoch": 0.3522452614171225, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.8134026527404785, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8500967025756836, + "num_tokens": 105748221.0, + "step": 2769 + }, + { + "epoch": 0.352372471695713, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9371049404144287, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8539243340492249, + "num_tokens": 105784568.0, + "step": 2770 + }, + { + "epoch": 0.35249968197430354, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 3.9586527347564697, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8437792062759399, + "num_tokens": 105827223.0, + "step": 2771 + }, + { + "epoch": 0.352626892252894, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.848890781402588, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8740160465240479, + "num_tokens": 105871014.0, + "step": 2772 + }, + { + "epoch": 0.35275410253148454, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9640024900436401, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.852566123008728, + "num_tokens": 105908995.0, + "step": 2773 + }, + { + "epoch": 0.35288131281007507, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.847880482673645, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8506907820701599, + "num_tokens": 105948243.0, + "step": 2774 + }, + { + "epoch": 0.35300852308866554, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9071425199508667, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8586518168449402, + "num_tokens": 105983997.0, + "step": 2775 + }, + { + "epoch": 0.35313573336725607, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9872153997421265, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8434828519821167, + "num_tokens": 106020336.0, + "step": 2776 + }, + { + "epoch": 0.3532629436458466, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 2.093599557876587, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8566397428512573, + "num_tokens": 106053300.0, + "step": 2777 + }, + { + "epoch": 0.35339015392443707, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.9500676393508911, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8707749843597412, + "num_tokens": 106091790.0, + "step": 2778 + }, + { + "epoch": 0.3535173642030276, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.2369415760040283, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.858852744102478, + "num_tokens": 106123768.0, + "step": 2779 + }, + { + "epoch": 0.3536445744816181, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9797275066375732, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8625999093055725, + "num_tokens": 106158219.0, + "step": 2780 + }, + { + "epoch": 0.35377178476020865, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.074249744415283, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8510807156562805, + "num_tokens": 106195795.0, + "step": 2781 + }, + { + "epoch": 0.3538989950387991, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9473642110824585, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8560774326324463, + "num_tokens": 106234973.0, + "step": 2782 + }, + { + "epoch": 0.35402620531738965, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.1059494018554688, + "learning_rate": 1e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8421212434768677, + "num_tokens": 106267297.0, + "step": 2783 + }, + { + "epoch": 0.3541534155959802, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.8364589214324951, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.863392174243927, + "num_tokens": 106309070.0, + "step": 2784 + }, + { + "epoch": 0.35428062587457065, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.036717176437378, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8601195216178894, + "num_tokens": 106345784.0, + "step": 2785 + }, + { + "epoch": 0.3544078361531612, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9392690658569336, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8639870882034302, + "num_tokens": 106383172.0, + "step": 2786 + }, + { + "epoch": 0.3545350464317517, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.0420005321502686, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8539519309997559, + "num_tokens": 106416916.0, + "step": 2787 + }, + { + "epoch": 0.3546622567103422, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.821709394454956, + "learning_rate": 1e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8409782648086548, + "num_tokens": 106456969.0, + "step": 2788 + }, + { + "epoch": 0.3547894669889327, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.023118734359741, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8544615507125854, + "num_tokens": 106490476.0, + "step": 2789 + }, + { + "epoch": 0.35491667726752324, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.1266400814056396, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8564769625663757, + "num_tokens": 106525990.0, + "step": 2790 + }, + { + "epoch": 0.3550438875461137, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.1280171871185303, + "learning_rate": 1e-06, + "loss": 0.5517, + "mean_token_accuracy": 0.831213116645813, + "num_tokens": 106562953.0, + "step": 2791 + }, + { + "epoch": 0.35517109782470424, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.9217181205749512, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8582602739334106, + "num_tokens": 106599142.0, + "step": 2792 + }, + { + "epoch": 0.35529830810329477, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.7901332378387451, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8680226802825928, + "num_tokens": 106636743.0, + "step": 2793 + }, + { + "epoch": 0.35542551838188524, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.7987679243087769, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8605356812477112, + "num_tokens": 106678992.0, + "step": 2794 + }, + { + "epoch": 0.35555272866047577, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.2041380405426025, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8467563986778259, + "num_tokens": 106719884.0, + "step": 2795 + }, + { + "epoch": 0.3556799389390663, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.871795415878296, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8518134355545044, + "num_tokens": 106755224.0, + "step": 2796 + }, + { + "epoch": 0.35580714921765677, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.8911826610565186, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.867796778678894, + "num_tokens": 106789366.0, + "step": 2797 + }, + { + "epoch": 0.3559343594962473, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.9187771081924438, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8475724458694458, + "num_tokens": 106826382.0, + "step": 2798 + }, + { + "epoch": 0.3560615697748378, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9401230812072754, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8468719720840454, + "num_tokens": 106860027.0, + "step": 2799 + }, + { + "epoch": 0.3561887800534283, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.23820424079895, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8650752305984497, + "num_tokens": 106898684.0, + "step": 2800 + }, + { + "epoch": 0.35631599033201883, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.023810625076294, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8562864065170288, + "num_tokens": 106934496.0, + "step": 2801 + }, + { + "epoch": 0.35644320061060936, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.8390672206878662, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8699958920478821, + "num_tokens": 106974335.0, + "step": 2802 + }, + { + "epoch": 0.35657041088919983, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.0936291217803955, + "learning_rate": 1e-06, + "loss": 0.5569, + "mean_token_accuracy": 0.8318301439285278, + "num_tokens": 107013914.0, + "step": 2803 + }, + { + "epoch": 0.35669762116779036, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.9563090801239014, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8604488968849182, + "num_tokens": 107053747.0, + "step": 2804 + }, + { + "epoch": 0.3568248314463809, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.934749722480774, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8466272354125977, + "num_tokens": 107095245.0, + "step": 2805 + }, + { + "epoch": 0.35695204172497136, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.8648698329925537, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8571216464042664, + "num_tokens": 107133627.0, + "step": 2806 + }, + { + "epoch": 0.3570792520035619, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.753523588180542, + "learning_rate": 1e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.8320486545562744, + "num_tokens": 107170128.0, + "step": 2807 + }, + { + "epoch": 0.3572064622821524, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.8237125873565674, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8664889931678772, + "num_tokens": 107209137.0, + "step": 2808 + }, + { + "epoch": 0.3573336725607429, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.017449140548706, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8507503271102905, + "num_tokens": 107244866.0, + "step": 2809 + }, + { + "epoch": 0.3574608828393334, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.9457149505615234, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8566325902938843, + "num_tokens": 107287399.0, + "step": 2810 + }, + { + "epoch": 0.35758809311792394, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 2.038771390914917, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8447934985160828, + "num_tokens": 107325627.0, + "step": 2811 + }, + { + "epoch": 0.3577153033965144, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.9651583433151245, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8711845278739929, + "num_tokens": 107359375.0, + "step": 2812 + }, + { + "epoch": 0.35784251367510495, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9462368488311768, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8425973653793335, + "num_tokens": 107400286.0, + "step": 2813 + }, + { + "epoch": 0.3579697239536955, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.0712952613830566, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8464009761810303, + "num_tokens": 107431768.0, + "step": 2814 + }, + { + "epoch": 0.35809693423228595, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.796797513961792, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8655635118484497, + "num_tokens": 107470875.0, + "step": 2815 + }, + { + "epoch": 0.3582241445108765, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9774936437606812, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8464508056640625, + "num_tokens": 107510988.0, + "step": 2816 + }, + { + "epoch": 0.358351354789467, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.8160591125488281, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.862949013710022, + "num_tokens": 107552423.0, + "step": 2817 + }, + { + "epoch": 0.3584785650680575, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.966544270515442, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8484494686126709, + "num_tokens": 107591693.0, + "step": 2818 + }, + { + "epoch": 0.358605775346648, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.913559913635254, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.86097651720047, + "num_tokens": 107626017.0, + "step": 2819 + }, + { + "epoch": 0.35873298562523853, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.08988618850708, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8714891076087952, + "num_tokens": 107664927.0, + "step": 2820 + }, + { + "epoch": 0.358860195903829, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9937329292297363, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.848587691783905, + "num_tokens": 107702487.0, + "step": 2821 + }, + { + "epoch": 0.35898740618241953, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.8194000720977783, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8693010807037354, + "num_tokens": 107741263.0, + "step": 2822 + }, + { + "epoch": 0.35911461646101006, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9452623128890991, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8587864637374878, + "num_tokens": 107781079.0, + "step": 2823 + }, + { + "epoch": 0.35924182673960053, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.803002119064331, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8740991353988647, + "num_tokens": 107819309.0, + "step": 2824 + }, + { + "epoch": 0.35936903701819106, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.055853843688965, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.858228325843811, + "num_tokens": 107853251.0, + "step": 2825 + }, + { + "epoch": 0.3594962472967816, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.70964777469635, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8765989542007446, + "num_tokens": 107893476.0, + "step": 2826 + }, + { + "epoch": 0.35962345757537206, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.7429521083831787, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8664087057113647, + "num_tokens": 107931204.0, + "step": 2827 + }, + { + "epoch": 0.3597506678539626, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.1445279121398926, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8377987742424011, + "num_tokens": 107966471.0, + "step": 2828 + }, + { + "epoch": 0.3598778781325531, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9355250597000122, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.857944130897522, + "num_tokens": 108005328.0, + "step": 2829 + }, + { + "epoch": 0.36000508841114365, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.193650722503662, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.845084547996521, + "num_tokens": 108046295.0, + "step": 2830 + }, + { + "epoch": 0.3601322986897341, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.139072895050049, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8585467338562012, + "num_tokens": 108078164.0, + "step": 2831 + }, + { + "epoch": 0.36025950896832465, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.8391157388687134, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8517712354660034, + "num_tokens": 108117105.0, + "step": 2832 + }, + { + "epoch": 0.3603867192469152, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.971190333366394, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8532260656356812, + "num_tokens": 108155301.0, + "step": 2833 + }, + { + "epoch": 0.36051392952550565, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9104151725769043, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8663148283958435, + "num_tokens": 108193091.0, + "step": 2834 + }, + { + "epoch": 0.3606411398040962, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.016143560409546, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8540983200073242, + "num_tokens": 108233719.0, + "step": 2835 + }, + { + "epoch": 0.3607683500826867, + "ewc_loss": 5.692243576049805e-06, + "grad_norm": 80.52664184570312, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8620669841766357, + "num_tokens": 108265775.0, + "step": 2836 + }, + { + "epoch": 0.3608955603612772, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 2.060809373855591, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8658884763717651, + "num_tokens": 108307953.0, + "step": 2837 + }, + { + "epoch": 0.3610227706398677, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 1.928544521331787, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8517500162124634, + "num_tokens": 108351998.0, + "step": 2838 + }, + { + "epoch": 0.36114998091845824, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 3.539313316345215, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8432980179786682, + "num_tokens": 108395968.0, + "step": 2839 + }, + { + "epoch": 0.3612771911970487, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 1.9149138927459717, + "learning_rate": 1e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8365667462348938, + "num_tokens": 108433045.0, + "step": 2840 + }, + { + "epoch": 0.36140440147563924, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.8338295221328735, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.851856529712677, + "num_tokens": 108470964.0, + "step": 2841 + }, + { + "epoch": 0.36153161175422976, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.7853072881698608, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8572642803192139, + "num_tokens": 108512572.0, + "step": 2842 + }, + { + "epoch": 0.36165882203282024, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.890632152557373, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8414961099624634, + "num_tokens": 108548526.0, + "step": 2843 + }, + { + "epoch": 0.36178603231141077, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.254859685897827, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8528344631195068, + "num_tokens": 108588447.0, + "step": 2844 + }, + { + "epoch": 0.3619132425900013, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.951927900314331, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8462753295898438, + "num_tokens": 108628426.0, + "step": 2845 + }, + { + "epoch": 0.36204045286859177, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.0978331565856934, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8507325649261475, + "num_tokens": 108661355.0, + "step": 2846 + }, + { + "epoch": 0.3621676631471823, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.8916945457458496, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.873077929019928, + "num_tokens": 108698505.0, + "step": 2847 + }, + { + "epoch": 0.3622948734257728, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.913206934928894, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8606957793235779, + "num_tokens": 108737409.0, + "step": 2848 + }, + { + "epoch": 0.3624220837043633, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9872019290924072, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8567953705787659, + "num_tokens": 108772969.0, + "step": 2849 + }, + { + "epoch": 0.3625492939829538, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.0174052715301514, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8626834154129028, + "num_tokens": 108805129.0, + "step": 2850 + }, + { + "epoch": 0.36267650426154435, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.8476704359054565, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8407085537910461, + "num_tokens": 108841197.0, + "step": 2851 + }, + { + "epoch": 0.3628037145401348, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.9006494283676147, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8694130778312683, + "num_tokens": 108876442.0, + "step": 2852 + }, + { + "epoch": 0.36293092481872535, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.9934989213943481, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.843090295791626, + "num_tokens": 108911432.0, + "step": 2853 + }, + { + "epoch": 0.3630581350973159, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.049663543701172, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8424878716468811, + "num_tokens": 108952977.0, + "step": 2854 + }, + { + "epoch": 0.36318534537590635, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.9903651475906372, + "learning_rate": 1e-06, + "loss": 0.5388, + "mean_token_accuracy": 0.833155632019043, + "num_tokens": 108995602.0, + "step": 2855 + }, + { + "epoch": 0.3633125556544969, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.768972396850586, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8582059741020203, + "num_tokens": 109040226.0, + "step": 2856 + }, + { + "epoch": 0.3634397659330874, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.030927896499634, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8571308851242065, + "num_tokens": 109082898.0, + "step": 2857 + }, + { + "epoch": 0.3635669762116779, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.99044930934906, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8469012975692749, + "num_tokens": 109122460.0, + "step": 2858 + }, + { + "epoch": 0.3636941864902684, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.8320436477661133, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8551462292671204, + "num_tokens": 109161680.0, + "step": 2859 + }, + { + "epoch": 0.36382139676885894, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.7571169137954712, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8750064373016357, + "num_tokens": 109198943.0, + "step": 2860 + }, + { + "epoch": 0.3639486070474494, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.8813931941986084, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8628399968147278, + "num_tokens": 109235719.0, + "step": 2861 + }, + { + "epoch": 0.36407581732603994, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.9493712186813354, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8487455248832703, + "num_tokens": 109272681.0, + "step": 2862 + }, + { + "epoch": 0.36420302760463047, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.4140849113464355, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8437111377716064, + "num_tokens": 109306114.0, + "step": 2863 + }, + { + "epoch": 0.36433023788322094, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.1924681663513184, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8473117351531982, + "num_tokens": 109337728.0, + "step": 2864 + }, + { + "epoch": 0.36445744816181147, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.905328631401062, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8469639420509338, + "num_tokens": 109377809.0, + "step": 2865 + }, + { + "epoch": 0.364584658440402, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.865153193473816, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8690911531448364, + "num_tokens": 109413499.0, + "step": 2866 + }, + { + "epoch": 0.36471186871899247, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.7576936483383179, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.845668613910675, + "num_tokens": 109460961.0, + "step": 2867 + }, + { + "epoch": 0.364839078997583, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.75128173828125, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8656062483787537, + "num_tokens": 109500615.0, + "step": 2868 + }, + { + "epoch": 0.36496628927617353, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.0119481086730957, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8589358329772949, + "num_tokens": 109533921.0, + "step": 2869 + }, + { + "epoch": 0.365093499554764, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.018103837966919, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8442365527153015, + "num_tokens": 109567069.0, + "step": 2870 + }, + { + "epoch": 0.36522070983335453, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.9464654922485352, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8682765364646912, + "num_tokens": 109603696.0, + "step": 2871 + }, + { + "epoch": 0.36534792011194506, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.786819577217102, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8726234436035156, + "num_tokens": 109644590.0, + "step": 2872 + }, + { + "epoch": 0.36547513039053553, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 16.996936798095703, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8510154485702515, + "num_tokens": 109678869.0, + "step": 2873 + }, + { + "epoch": 0.36560234066912606, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 2.069150686264038, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8501958847045898, + "num_tokens": 109722517.0, + "step": 2874 + }, + { + "epoch": 0.3657295509477166, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 2.0496644973754883, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8520570993423462, + "num_tokens": 109757869.0, + "step": 2875 + }, + { + "epoch": 0.36585676122630706, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 2.110628128051758, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8507128953933716, + "num_tokens": 109793162.0, + "step": 2876 + }, + { + "epoch": 0.3659839715048976, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.839795708656311, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8627849817276001, + "num_tokens": 109832942.0, + "step": 2877 + }, + { + "epoch": 0.3661111817834881, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.7882583141326904, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8547097444534302, + "num_tokens": 109876230.0, + "step": 2878 + }, + { + "epoch": 0.3662383920620786, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.9025540351867676, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8460481762886047, + "num_tokens": 109911844.0, + "step": 2879 + }, + { + "epoch": 0.3663656023406691, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.8527880907058716, + "learning_rate": 1e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8430823087692261, + "num_tokens": 109957619.0, + "step": 2880 + }, + { + "epoch": 0.36649281261925964, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.9396218061447144, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.84410160779953, + "num_tokens": 109999117.0, + "step": 2881 + }, + { + "epoch": 0.3666200228978502, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.8534064292907715, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8625621795654297, + "num_tokens": 110039557.0, + "step": 2882 + }, + { + "epoch": 0.36674723317644065, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.9212520122528076, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8459951281547546, + "num_tokens": 110074418.0, + "step": 2883 + }, + { + "epoch": 0.3668744434550312, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.8210248947143555, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8604012131690979, + "num_tokens": 110109255.0, + "step": 2884 + }, + { + "epoch": 0.3670016537336217, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.7872289419174194, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8625265955924988, + "num_tokens": 110150180.0, + "step": 2885 + }, + { + "epoch": 0.3671288640122122, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 2.2172110080718994, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8389491438865662, + "num_tokens": 110193653.0, + "step": 2886 + }, + { + "epoch": 0.3672560742908027, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.8790794610977173, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8636854887008667, + "num_tokens": 110231020.0, + "step": 2887 + }, + { + "epoch": 0.36738328456939323, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.8176708221435547, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8715568780899048, + "num_tokens": 110269397.0, + "step": 2888 + }, + { + "epoch": 0.3675104948479837, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.814656376838684, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8698660135269165, + "num_tokens": 110311188.0, + "step": 2889 + }, + { + "epoch": 0.36763770512657423, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 2.1259658336639404, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8600465059280396, + "num_tokens": 110343801.0, + "step": 2890 + }, + { + "epoch": 0.36776491540516476, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 2.130836009979248, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8382680416107178, + "num_tokens": 110385830.0, + "step": 2891 + }, + { + "epoch": 0.36789212568375523, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 2.29258394241333, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.854562520980835, + "num_tokens": 110421494.0, + "step": 2892 + }, + { + "epoch": 0.36801933596234576, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.7373676300048828, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8550422191619873, + "num_tokens": 110463132.0, + "step": 2893 + }, + { + "epoch": 0.3681465462409363, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.8358469009399414, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.857776403427124, + "num_tokens": 110498330.0, + "step": 2894 + }, + { + "epoch": 0.36827375651952676, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 3.589596748352051, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8637380599975586, + "num_tokens": 110534438.0, + "step": 2895 + }, + { + "epoch": 0.3684009667981173, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.8538942337036133, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8512948751449585, + "num_tokens": 110572059.0, + "step": 2896 + }, + { + "epoch": 0.3685281770767078, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 2.0998027324676514, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8578881025314331, + "num_tokens": 110611012.0, + "step": 2897 + }, + { + "epoch": 0.3686553873552983, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 2.1501572132110596, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8637336492538452, + "num_tokens": 110647429.0, + "step": 2898 + }, + { + "epoch": 0.3687825976338888, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.8655474185943604, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8497439622879028, + "num_tokens": 110685052.0, + "step": 2899 + }, + { + "epoch": 0.36890980791247935, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.8948601484298706, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.836015522480011, + "num_tokens": 110725222.0, + "step": 2900 + }, + { + "epoch": 0.3690370181910698, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.84121572971344, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8441795110702515, + "num_tokens": 110764314.0, + "step": 2901 + }, + { + "epoch": 0.36916422846966035, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.6965185403823853, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.866926908493042, + "num_tokens": 110803585.0, + "step": 2902 + }, + { + "epoch": 0.3692914387482509, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.7669671773910522, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.850318431854248, + "num_tokens": 110846532.0, + "step": 2903 + }, + { + "epoch": 0.36941864902684135, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.8005421161651611, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8538041710853577, + "num_tokens": 110889807.0, + "step": 2904 + }, + { + "epoch": 0.3695458593054319, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.7660877704620361, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.835877537727356, + "num_tokens": 110930567.0, + "step": 2905 + }, + { + "epoch": 0.3696730695840224, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 2.0521726608276367, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8630505800247192, + "num_tokens": 110968725.0, + "step": 2906 + }, + { + "epoch": 0.3698002798626129, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.8334242105484009, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8787760138511658, + "num_tokens": 111002724.0, + "step": 2907 + }, + { + "epoch": 0.3699274901412034, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 2.0492959022521973, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8554880023002625, + "num_tokens": 111036668.0, + "step": 2908 + }, + { + "epoch": 0.37005470041979394, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.9414393901824951, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8449058532714844, + "num_tokens": 111077712.0, + "step": 2909 + }, + { + "epoch": 0.3701819106983844, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.8274222612380981, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8464758396148682, + "num_tokens": 111118856.0, + "step": 2910 + }, + { + "epoch": 0.37030912097697494, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.844915747642517, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8470646142959595, + "num_tokens": 111156712.0, + "step": 2911 + }, + { + "epoch": 0.37043633125556547, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.8765918016433716, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8566820621490479, + "num_tokens": 111195542.0, + "step": 2912 + }, + { + "epoch": 0.37056354153415594, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.80474853515625, + "learning_rate": 1e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8367646932601929, + "num_tokens": 111237827.0, + "step": 2913 + }, + { + "epoch": 0.37069075181274647, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.7679568529129028, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8630703687667847, + "num_tokens": 111278802.0, + "step": 2914 + }, + { + "epoch": 0.370817962091337, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.961968183517456, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8527873754501343, + "num_tokens": 111313795.0, + "step": 2915 + }, + { + "epoch": 0.37094517236992747, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.770652174949646, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8634305596351624, + "num_tokens": 111353798.0, + "step": 2916 + }, + { + "epoch": 0.371072382648518, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.9428619146347046, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8561821579933167, + "num_tokens": 111390613.0, + "step": 2917 + }, + { + "epoch": 0.3711995929271085, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.781348705291748, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8558194637298584, + "num_tokens": 111432043.0, + "step": 2918 + }, + { + "epoch": 0.371326803205699, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.724602460861206, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8617801070213318, + "num_tokens": 111476575.0, + "step": 2919 + }, + { + "epoch": 0.3714540134842895, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.919402003288269, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8564852476119995, + "num_tokens": 111515304.0, + "step": 2920 + }, + { + "epoch": 0.37158122376288005, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.8672113418579102, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8650403618812561, + "num_tokens": 111550246.0, + "step": 2921 + }, + { + "epoch": 0.3717084340414705, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 2.24634051322937, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.856786847114563, + "num_tokens": 111592189.0, + "step": 2922 + }, + { + "epoch": 0.37183564432006105, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 2.2965893745422363, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8590238690376282, + "num_tokens": 111638177.0, + "step": 2923 + }, + { + "epoch": 0.3719628545986516, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 2.076580762863159, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8558679819107056, + "num_tokens": 111674365.0, + "step": 2924 + }, + { + "epoch": 0.37209006487724205, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 2.097119092941284, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8508092164993286, + "num_tokens": 111714571.0, + "step": 2925 + }, + { + "epoch": 0.3722172751558326, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.902868390083313, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.856447160243988, + "num_tokens": 111755214.0, + "step": 2926 + }, + { + "epoch": 0.3723444854344231, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.9539138078689575, + "learning_rate": 1e-06, + "loss": 0.5421, + "mean_token_accuracy": 0.8284852504730225, + "num_tokens": 111796516.0, + "step": 2927 + }, + { + "epoch": 0.3724716957130136, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.8911848068237305, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8406917452812195, + "num_tokens": 111832615.0, + "step": 2928 + }, + { + "epoch": 0.3725989059916041, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.881223201751709, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8459935188293457, + "num_tokens": 111874682.0, + "step": 2929 + }, + { + "epoch": 0.37272611627019464, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.8886061906814575, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8430144786834717, + "num_tokens": 111916369.0, + "step": 2930 + }, + { + "epoch": 0.37285332654878517, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 2.1086928844451904, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8491207361221313, + "num_tokens": 111950671.0, + "step": 2931 + }, + { + "epoch": 0.37298053682737564, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.8389275074005127, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8390641808509827, + "num_tokens": 111988846.0, + "step": 2932 + }, + { + "epoch": 0.37310774710596617, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.9589849710464478, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8646926879882812, + "num_tokens": 112022853.0, + "step": 2933 + }, + { + "epoch": 0.3732349573845567, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.7974313497543335, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8523167967796326, + "num_tokens": 112065631.0, + "step": 2934 + }, + { + "epoch": 0.37336216766314717, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.887775182723999, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8451229333877563, + "num_tokens": 112107236.0, + "step": 2935 + }, + { + "epoch": 0.3734893779417377, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.7979923486709595, + "learning_rate": 1e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8338863253593445, + "num_tokens": 112153914.0, + "step": 2936 + }, + { + "epoch": 0.3736165882203282, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.9692676067352295, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8581939339637756, + "num_tokens": 112187596.0, + "step": 2937 + }, + { + "epoch": 0.3737437984989187, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.9367222785949707, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8533376455307007, + "num_tokens": 112222409.0, + "step": 2938 + }, + { + "epoch": 0.37387100877750923, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.912909984588623, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8428157567977905, + "num_tokens": 112257699.0, + "step": 2939 + }, + { + "epoch": 0.37399821905609976, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 2.0269768238067627, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8632056713104248, + "num_tokens": 112292431.0, + "step": 2940 + }, + { + "epoch": 0.37412542933469023, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.8984391689300537, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8566870093345642, + "num_tokens": 112331241.0, + "step": 2941 + }, + { + "epoch": 0.37425263961328076, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.9210084676742554, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8514366149902344, + "num_tokens": 112368486.0, + "step": 2942 + }, + { + "epoch": 0.3743798498918713, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.038491725921631, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.846043586730957, + "num_tokens": 112409638.0, + "step": 2943 + }, + { + "epoch": 0.37450706017046176, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.199127674102783, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8501603603363037, + "num_tokens": 112448257.0, + "step": 2944 + }, + { + "epoch": 0.3746342704490523, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.771179437637329, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8702179193496704, + "num_tokens": 112488739.0, + "step": 2945 + }, + { + "epoch": 0.3747614807276428, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8923916816711426, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.839599609375, + "num_tokens": 112528621.0, + "step": 2946 + }, + { + "epoch": 0.3748886910062333, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8933138847351074, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8503483533859253, + "num_tokens": 112564615.0, + "step": 2947 + }, + { + "epoch": 0.3750159012848238, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.2115800380706787, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.844393253326416, + "num_tokens": 112598190.0, + "step": 2948 + }, + { + "epoch": 0.37514311156341434, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.034891128540039, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8547027111053467, + "num_tokens": 112632078.0, + "step": 2949 + }, + { + "epoch": 0.3752703218420048, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.9691280126571655, + "learning_rate": 1e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.8309820294380188, + "num_tokens": 112667848.0, + "step": 2950 + }, + { + "epoch": 0.37539753212059535, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8017666339874268, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.852286696434021, + "num_tokens": 112707037.0, + "step": 2951 + }, + { + "epoch": 0.3755247423991859, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.5826207399368286, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8642416000366211, + "num_tokens": 112751921.0, + "step": 2952 + }, + { + "epoch": 0.37565195267777635, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 5.40201997756958, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8593062162399292, + "num_tokens": 112793061.0, + "step": 2953 + }, + { + "epoch": 0.3757791629563669, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8954789638519287, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8637985587120056, + "num_tokens": 112833852.0, + "step": 2954 + }, + { + "epoch": 0.3759063732349574, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.143268585205078, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.850862979888916, + "num_tokens": 112866718.0, + "step": 2955 + }, + { + "epoch": 0.3760335835135479, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.093167304992676, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.855384349822998, + "num_tokens": 112909420.0, + "step": 2956 + }, + { + "epoch": 0.3761607937921384, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.050095558166504, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8570876121520996, + "num_tokens": 112944570.0, + "step": 2957 + }, + { + "epoch": 0.37628800407072893, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.7701078653335571, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.866753101348877, + "num_tokens": 112984107.0, + "step": 2958 + }, + { + "epoch": 0.3764152143493194, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.7082895040512085, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8781919479370117, + "num_tokens": 113023066.0, + "step": 2959 + }, + { + "epoch": 0.37654242462790993, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8643479347229004, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8537379503250122, + "num_tokens": 113060093.0, + "step": 2960 + }, + { + "epoch": 0.37666963490650046, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.9649277925491333, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.846611738204956, + "num_tokens": 113095940.0, + "step": 2961 + }, + { + "epoch": 0.37679684518509093, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8364626169204712, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8660461902618408, + "num_tokens": 113130699.0, + "step": 2962 + }, + { + "epoch": 0.37692405546368146, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8117159605026245, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8545944690704346, + "num_tokens": 113169145.0, + "step": 2963 + }, + { + "epoch": 0.377051265742272, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.9962778091430664, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8686208128929138, + "num_tokens": 113201987.0, + "step": 2964 + }, + { + "epoch": 0.37717847602086246, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.0935049057006836, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8616865873336792, + "num_tokens": 113239588.0, + "step": 2965 + }, + { + "epoch": 0.377305686299453, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.9616748094558716, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8557224273681641, + "num_tokens": 113277075.0, + "step": 2966 + }, + { + "epoch": 0.3774328965780435, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.9192981719970703, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8831104040145874, + "num_tokens": 113313708.0, + "step": 2967 + }, + { + "epoch": 0.377560106856634, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.1542539596557617, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8391170501708984, + "num_tokens": 113344027.0, + "step": 2968 + }, + { + "epoch": 0.3776873171352245, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.819733738899231, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8584722876548767, + "num_tokens": 113388444.0, + "step": 2969 + }, + { + "epoch": 0.37781452741381505, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8029797077178955, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.850818932056427, + "num_tokens": 113428660.0, + "step": 2970 + }, + { + "epoch": 0.3779417376924055, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.185842752456665, + "learning_rate": 1e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.8332604765892029, + "num_tokens": 113469160.0, + "step": 2971 + }, + { + "epoch": 0.37806894797099605, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.961389422416687, + "learning_rate": 1e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8313934803009033, + "num_tokens": 113504291.0, + "step": 2972 + }, + { + "epoch": 0.3781961582495866, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.693660855293274, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.855660080909729, + "num_tokens": 113548085.0, + "step": 2973 + }, + { + "epoch": 0.37832336852817705, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.965653896331787, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8598980903625488, + "num_tokens": 113581664.0, + "step": 2974 + }, + { + "epoch": 0.3784505788067676, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8744785785675049, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8675400614738464, + "num_tokens": 113618552.0, + "step": 2975 + }, + { + "epoch": 0.3785777890853581, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.3057942390441895, + "learning_rate": 1e-06, + "loss": 0.5332, + "mean_token_accuracy": 0.8344375491142273, + "num_tokens": 113653406.0, + "step": 2976 + }, + { + "epoch": 0.3787049993639486, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8570669889450073, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8647498488426208, + "num_tokens": 113693803.0, + "step": 2977 + }, + { + "epoch": 0.3788322096425391, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.7472552061080933, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8697125911712646, + "num_tokens": 113734637.0, + "step": 2978 + }, + { + "epoch": 0.37895941992112964, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8261940479278564, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8483130931854248, + "num_tokens": 113776579.0, + "step": 2979 + }, + { + "epoch": 0.37908663019972016, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8305596113204956, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8457862138748169, + "num_tokens": 113818857.0, + "step": 2980 + }, + { + "epoch": 0.37921384047831064, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.9902828931808472, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8470181226730347, + "num_tokens": 113853248.0, + "step": 2981 + }, + { + "epoch": 0.37934105075690117, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8040485382080078, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8600412011146545, + "num_tokens": 113894206.0, + "step": 2982 + }, + { + "epoch": 0.3794682610354917, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.022406578063965, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8622559309005737, + "num_tokens": 113931309.0, + "step": 2983 + }, + { + "epoch": 0.37959547131408217, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8297061920166016, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8443708419799805, + "num_tokens": 113976559.0, + "step": 2984 + }, + { + "epoch": 0.3797226815926727, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.0502452850341797, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8543744683265686, + "num_tokens": 114010814.0, + "step": 2985 + }, + { + "epoch": 0.3798498918712632, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.796942949295044, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8733294606208801, + "num_tokens": 114049389.0, + "step": 2986 + }, + { + "epoch": 0.3799771021498537, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.4369566440582275, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8736122846603394, + "num_tokens": 114081463.0, + "step": 2987 + }, + { + "epoch": 0.3801043124284442, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.000769853591919, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8708959221839905, + "num_tokens": 114117175.0, + "step": 2988 + }, + { + "epoch": 0.38023152270703475, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.1555705070495605, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8439948558807373, + "num_tokens": 114153764.0, + "step": 2989 + }, + { + "epoch": 0.3803587329856252, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.915349006652832, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8543130159378052, + "num_tokens": 114191487.0, + "step": 2990 + }, + { + "epoch": 0.38048594326421575, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8945363759994507, + "learning_rate": 1e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8305338621139526, + "num_tokens": 114233880.0, + "step": 2991 + }, + { + "epoch": 0.3806131535428063, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.050266742706299, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8529583811759949, + "num_tokens": 114269238.0, + "step": 2992 + }, + { + "epoch": 0.38074036382139675, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.7750859260559082, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8676469922065735, + "num_tokens": 114307645.0, + "step": 2993 + }, + { + "epoch": 0.3808675740999873, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.9725419282913208, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.858771562576294, + "num_tokens": 114341800.0, + "step": 2994 + }, + { + "epoch": 0.3809947843785778, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.785285234451294, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8641008138656616, + "num_tokens": 114378363.0, + "step": 2995 + }, + { + "epoch": 0.3811219946571683, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.0208017826080322, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8438597917556763, + "num_tokens": 114414489.0, + "step": 2996 + }, + { + "epoch": 0.3812492049357588, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.0156490802764893, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8497154116630554, + "num_tokens": 114449197.0, + "step": 2997 + }, + { + "epoch": 0.38137641521434934, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.023646116256714, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8529305458068848, + "num_tokens": 114478654.0, + "step": 2998 + }, + { + "epoch": 0.3815036254929398, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.811143398284912, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8634893894195557, + "num_tokens": 114521807.0, + "step": 2999 + }, + { + "epoch": 0.38163083577153034, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8591219186782837, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8687433004379272, + "num_tokens": 114558297.0, + "step": 3000 + }, + { + "epoch": 0.38175804605012087, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.9127593040466309, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8618959784507751, + "num_tokens": 114599630.0, + "step": 3001 + }, + { + "epoch": 0.38188525632871134, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 7.7726545333862305, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.852752685546875, + "num_tokens": 114638697.0, + "step": 3002 + }, + { + "epoch": 0.38201246660730187, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.007333517074585, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8549431562423706, + "num_tokens": 114676106.0, + "step": 3003 + }, + { + "epoch": 0.3821396768858924, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.9801017045974731, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8534170985221863, + "num_tokens": 114712026.0, + "step": 3004 + }, + { + "epoch": 0.38226688716448287, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.8989278078079224, + "learning_rate": 1e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8340545892715454, + "num_tokens": 114752348.0, + "step": 3005 + }, + { + "epoch": 0.3823940974430734, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.3636937141418457, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8605679273605347, + "num_tokens": 114789922.0, + "step": 3006 + }, + { + "epoch": 0.38252130772166393, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.7679437398910522, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8497165441513062, + "num_tokens": 114838834.0, + "step": 3007 + }, + { + "epoch": 0.3826485180002544, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.1154630184173584, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8347046375274658, + "num_tokens": 114873320.0, + "step": 3008 + }, + { + "epoch": 0.38277572827884493, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.9317879676818848, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.866544783115387, + "num_tokens": 114908293.0, + "step": 3009 + }, + { + "epoch": 0.38290293855743546, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.883304476737976, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8454346656799316, + "num_tokens": 114950083.0, + "step": 3010 + }, + { + "epoch": 0.38303014883602593, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.0013461112976074, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8598027229309082, + "num_tokens": 114982353.0, + "step": 3011 + }, + { + "epoch": 0.38315735911461646, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8519625663757324, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8613560199737549, + "num_tokens": 115017668.0, + "step": 3012 + }, + { + "epoch": 0.383284569393207, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 7.799086093902588, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8664757609367371, + "num_tokens": 115055266.0, + "step": 3013 + }, + { + "epoch": 0.38341177967179746, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.0904932022094727, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8506537675857544, + "num_tokens": 115089728.0, + "step": 3014 + }, + { + "epoch": 0.383538989950388, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.9350991249084473, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.840469241142273, + "num_tokens": 115135192.0, + "step": 3015 + }, + { + "epoch": 0.3836662002289785, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.218299150466919, + "learning_rate": 1e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.837628960609436, + "num_tokens": 115170350.0, + "step": 3016 + }, + { + "epoch": 0.383793410507569, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.9874976873397827, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8625604510307312, + "num_tokens": 115202724.0, + "step": 3017 + }, + { + "epoch": 0.3839206207861595, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.8229703903198242, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8559799790382385, + "num_tokens": 115239274.0, + "step": 3018 + }, + { + "epoch": 0.38404783106475004, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.9268745183944702, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.862351655960083, + "num_tokens": 115278330.0, + "step": 3019 + }, + { + "epoch": 0.3841750413433405, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.8515301942825317, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8520334959030151, + "num_tokens": 115321258.0, + "step": 3020 + }, + { + "epoch": 0.38430225162193105, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.830345630645752, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8565546274185181, + "num_tokens": 115362609.0, + "step": 3021 + }, + { + "epoch": 0.3844294619005216, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.9660675525665283, + "learning_rate": 1e-06, + "loss": 0.5492, + "mean_token_accuracy": 0.8291696906089783, + "num_tokens": 115405814.0, + "step": 3022 + }, + { + "epoch": 0.38455667217911205, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.0394511222839355, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8502135872840881, + "num_tokens": 115440856.0, + "step": 3023 + }, + { + "epoch": 0.3846838824577026, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.0855391025543213, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8619610667228699, + "num_tokens": 115482582.0, + "step": 3024 + }, + { + "epoch": 0.3848110927362931, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.89652681350708, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8463462591171265, + "num_tokens": 115527268.0, + "step": 3025 + }, + { + "epoch": 0.3849383030148836, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.9559012651443481, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8499822616577148, + "num_tokens": 115563757.0, + "step": 3026 + }, + { + "epoch": 0.3850655132934741, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.006376266479492, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8591294288635254, + "num_tokens": 115599629.0, + "step": 3027 + }, + { + "epoch": 0.38519272357206463, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.143810272216797, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8596735000610352, + "num_tokens": 115634261.0, + "step": 3028 + }, + { + "epoch": 0.3853199338506551, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.865332007408142, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8676285743713379, + "num_tokens": 115671150.0, + "step": 3029 + }, + { + "epoch": 0.38544714412924563, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.087794065475464, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8683788776397705, + "num_tokens": 115714328.0, + "step": 3030 + }, + { + "epoch": 0.38557435440783616, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.0183513164520264, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8712700009346008, + "num_tokens": 115747118.0, + "step": 3031 + }, + { + "epoch": 0.3857015646864267, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.8728282451629639, + "learning_rate": 1e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8353919982910156, + "num_tokens": 115790309.0, + "step": 3032 + }, + { + "epoch": 0.38582877496501716, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.8780219554901123, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8672044277191162, + "num_tokens": 115825630.0, + "step": 3033 + }, + { + "epoch": 0.3859559852436077, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.8465765714645386, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8631635904312134, + "num_tokens": 115858848.0, + "step": 3034 + }, + { + "epoch": 0.3860831955221982, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.7046772241592407, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8646646738052368, + "num_tokens": 115900024.0, + "step": 3035 + }, + { + "epoch": 0.3862104058007887, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.9018723964691162, + "learning_rate": 1e-06, + "loss": 0.5332, + "mean_token_accuracy": 0.831682562828064, + "num_tokens": 115938899.0, + "step": 3036 + }, + { + "epoch": 0.3863376160793792, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.122943878173828, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8621711134910583, + "num_tokens": 115972199.0, + "step": 3037 + }, + { + "epoch": 0.38646482635796975, + "ewc_loss": 5.3942203521728516e-06, + "grad_norm": 8.645476341247559, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8647289276123047, + "num_tokens": 116011506.0, + "step": 3038 + }, + { + "epoch": 0.3865920366365602, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.0146114826202393, + "learning_rate": 1e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.8329923748970032, + "num_tokens": 116051428.0, + "step": 3039 + }, + { + "epoch": 0.38671924691515075, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.9777748584747314, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8644367456436157, + "num_tokens": 116090307.0, + "step": 3040 + }, + { + "epoch": 0.3868464571937413, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.2928152084350586, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8473133444786072, + "num_tokens": 116123381.0, + "step": 3041 + }, + { + "epoch": 0.38697366747233175, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.922910451889038, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8593018054962158, + "num_tokens": 116157755.0, + "step": 3042 + }, + { + "epoch": 0.3871008777509223, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.8044853210449219, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8561928272247314, + "num_tokens": 116197371.0, + "step": 3043 + }, + { + "epoch": 0.3872280880295128, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.8597861528396606, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8406495451927185, + "num_tokens": 116240544.0, + "step": 3044 + }, + { + "epoch": 0.3873552983081033, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.9911918640136719, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8529441952705383, + "num_tokens": 116272079.0, + "step": 3045 + }, + { + "epoch": 0.3874825085866938, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.9505618810653687, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8534960150718689, + "num_tokens": 116309641.0, + "step": 3046 + }, + { + "epoch": 0.38760971886528434, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.072965145111084, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8513857126235962, + "num_tokens": 116344470.0, + "step": 3047 + }, + { + "epoch": 0.3877369291438748, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.042051076889038, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8582350015640259, + "num_tokens": 116380177.0, + "step": 3048 + }, + { + "epoch": 0.38786413942246534, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.9152159690856934, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.866628885269165, + "num_tokens": 116416482.0, + "step": 3049 + }, + { + "epoch": 0.38799134970105587, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.983662486076355, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8426899909973145, + "num_tokens": 116456103.0, + "step": 3050 + }, + { + "epoch": 0.38811855997964634, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.8916478157043457, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8623998761177063, + "num_tokens": 116496810.0, + "step": 3051 + }, + { + "epoch": 0.38824577025823687, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.97188401222229, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8524200916290283, + "num_tokens": 116532999.0, + "step": 3052 + }, + { + "epoch": 0.3883729805368274, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.984981656074524, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8642935752868652, + "num_tokens": 116565700.0, + "step": 3053 + }, + { + "epoch": 0.38850019081541787, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.909900188446045, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8597899675369263, + "num_tokens": 116602798.0, + "step": 3054 + }, + { + "epoch": 0.3886274010940084, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.8747581243515015, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8677778244018555, + "num_tokens": 116638743.0, + "step": 3055 + }, + { + "epoch": 0.3887546113725989, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.950498342514038, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.868696928024292, + "num_tokens": 116672290.0, + "step": 3056 + }, + { + "epoch": 0.3888818216511894, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.8285526037216187, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.857802152633667, + "num_tokens": 116712593.0, + "step": 3057 + }, + { + "epoch": 0.3890090319297799, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.4188690185546875, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8556100726127625, + "num_tokens": 116748663.0, + "step": 3058 + }, + { + "epoch": 0.38913624220837045, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.779738426208496, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8517800569534302, + "num_tokens": 116796018.0, + "step": 3059 + }, + { + "epoch": 0.3892634524869609, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.9859988689422607, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8536760807037354, + "num_tokens": 116837287.0, + "step": 3060 + }, + { + "epoch": 0.38939066276555145, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 2.120122194290161, + "learning_rate": 1e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8392938375473022, + "num_tokens": 116872163.0, + "step": 3061 + }, + { + "epoch": 0.389517873044142, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.7992359399795532, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8638931512832642, + "num_tokens": 116913445.0, + "step": 3062 + }, + { + "epoch": 0.38964508332273246, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.8202509880065918, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8475054502487183, + "num_tokens": 116954469.0, + "step": 3063 + }, + { + "epoch": 0.389772293601323, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.9504073858261108, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8552899956703186, + "num_tokens": 116994339.0, + "step": 3064 + }, + { + "epoch": 0.3898995038799135, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.8952879905700684, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8682963848114014, + "num_tokens": 117030867.0, + "step": 3065 + }, + { + "epoch": 0.390026714158504, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 2.091249465942383, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8642146587371826, + "num_tokens": 117068309.0, + "step": 3066 + }, + { + "epoch": 0.3901539244370945, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.9039382934570312, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8508216142654419, + "num_tokens": 117105278.0, + "step": 3067 + }, + { + "epoch": 0.39028113471568504, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.9489258527755737, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8484904170036316, + "num_tokens": 117143096.0, + "step": 3068 + }, + { + "epoch": 0.3904083449942755, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.90701425075531, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8560419082641602, + "num_tokens": 117179638.0, + "step": 3069 + }, + { + "epoch": 0.39053555527286604, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.0875730514526367, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8684372901916504, + "num_tokens": 117211560.0, + "step": 3070 + }, + { + "epoch": 0.39066276555145657, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.880871295928955, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.848736047744751, + "num_tokens": 117250622.0, + "step": 3071 + }, + { + "epoch": 0.39078997583004704, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 2.020977020263672, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8760978579521179, + "num_tokens": 117287600.0, + "step": 3072 + }, + { + "epoch": 0.39091718610863757, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 2.2019829750061035, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.840688169002533, + "num_tokens": 117328188.0, + "step": 3073 + }, + { + "epoch": 0.3910443963872281, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.9134504795074463, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8631214499473572, + "num_tokens": 117367282.0, + "step": 3074 + }, + { + "epoch": 0.39117160666581857, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.993057131767273, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8668401837348938, + "num_tokens": 117403376.0, + "step": 3075 + }, + { + "epoch": 0.3912988169444091, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 2.059230327606201, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8476169109344482, + "num_tokens": 117438661.0, + "step": 3076 + }, + { + "epoch": 0.39142602722299963, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 2.1135692596435547, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8600740432739258, + "num_tokens": 117477351.0, + "step": 3077 + }, + { + "epoch": 0.3915532375015901, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.8595060110092163, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8540457487106323, + "num_tokens": 117512773.0, + "step": 3078 + }, + { + "epoch": 0.39168044778018063, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.9711391925811768, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8402177095413208, + "num_tokens": 117555370.0, + "step": 3079 + }, + { + "epoch": 0.39180765805877116, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 2.0501115322113037, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8628914952278137, + "num_tokens": 117594208.0, + "step": 3080 + }, + { + "epoch": 0.3919348683373617, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.8195384740829468, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8585701584815979, + "num_tokens": 117632715.0, + "step": 3081 + }, + { + "epoch": 0.39206207861595216, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 2.009661912918091, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8512397408485413, + "num_tokens": 117669221.0, + "step": 3082 + }, + { + "epoch": 0.3921892888945427, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.832360029220581, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8499419093132019, + "num_tokens": 117709559.0, + "step": 3083 + }, + { + "epoch": 0.3923164991731332, + "ewc_loss": 5.3942203521728516e-06, + "grad_norm": 1.8211086988449097, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8629623651504517, + "num_tokens": 117745240.0, + "step": 3084 + }, + { + "epoch": 0.3924437094517237, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.7219873666763306, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.859218418598175, + "num_tokens": 117787897.0, + "step": 3085 + }, + { + "epoch": 0.3925709197303142, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.9835394620895386, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8521003723144531, + "num_tokens": 117826862.0, + "step": 3086 + }, + { + "epoch": 0.39269813000890474, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.7767791748046875, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8511141538619995, + "num_tokens": 117866932.0, + "step": 3087 + }, + { + "epoch": 0.3928253402874952, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.9153642654418945, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8568781018257141, + "num_tokens": 117905538.0, + "step": 3088 + }, + { + "epoch": 0.39295255056608575, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.9992045164108276, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8440234065055847, + "num_tokens": 117945123.0, + "step": 3089 + }, + { + "epoch": 0.3930797608446763, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.8512382507324219, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8582903742790222, + "num_tokens": 117988743.0, + "step": 3090 + }, + { + "epoch": 0.39320697112326675, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.800409197807312, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8695842623710632, + "num_tokens": 118026078.0, + "step": 3091 + }, + { + "epoch": 0.3933341814018573, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.8994741439819336, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8666377067565918, + "num_tokens": 118060105.0, + "step": 3092 + }, + { + "epoch": 0.3934613916804478, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.8405975103378296, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8847715258598328, + "num_tokens": 118089677.0, + "step": 3093 + }, + { + "epoch": 0.3935886019590383, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.7209060192108154, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8686547875404358, + "num_tokens": 118128449.0, + "step": 3094 + }, + { + "epoch": 0.3937158122376288, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.8249880075454712, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.865789532661438, + "num_tokens": 118168411.0, + "step": 3095 + }, + { + "epoch": 0.39384302251621933, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.888571858406067, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8661286234855652, + "num_tokens": 118206097.0, + "step": 3096 + }, + { + "epoch": 0.3939702327948098, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.7858010530471802, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8692212700843811, + "num_tokens": 118244429.0, + "step": 3097 + }, + { + "epoch": 0.39409744307340033, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.9076229333877563, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8621375560760498, + "num_tokens": 118279125.0, + "step": 3098 + }, + { + "epoch": 0.39422465335199086, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.9840227365493774, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8563076257705688, + "num_tokens": 118313156.0, + "step": 3099 + }, + { + "epoch": 0.39435186363058133, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.976318120956421, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8422717452049255, + "num_tokens": 118349868.0, + "step": 3100 + }, + { + "epoch": 0.39447907390917186, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.9062869548797607, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8464727401733398, + "num_tokens": 118394998.0, + "step": 3101 + }, + { + "epoch": 0.3946062841877624, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.7908920049667358, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8676154613494873, + "num_tokens": 118436002.0, + "step": 3102 + }, + { + "epoch": 0.39473349446635286, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.774479866027832, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8610889911651611, + "num_tokens": 118475094.0, + "step": 3103 + }, + { + "epoch": 0.3948607047449434, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.776551604270935, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8535716533660889, + "num_tokens": 118514164.0, + "step": 3104 + }, + { + "epoch": 0.3949879150235339, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.90458083152771, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8523703813552856, + "num_tokens": 118554271.0, + "step": 3105 + }, + { + "epoch": 0.3951151253021244, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.8407045602798462, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8567116260528564, + "num_tokens": 118597123.0, + "step": 3106 + }, + { + "epoch": 0.3952423355807149, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.796014666557312, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8516968488693237, + "num_tokens": 118636683.0, + "step": 3107 + }, + { + "epoch": 0.39536954585930545, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 2.1010847091674805, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8604662418365479, + "num_tokens": 118665770.0, + "step": 3108 + }, + { + "epoch": 0.3954967561378959, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.8999766111373901, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8665130734443665, + "num_tokens": 118701652.0, + "step": 3109 + }, + { + "epoch": 0.39562396641648645, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.8190947771072388, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8629159927368164, + "num_tokens": 118741756.0, + "step": 3110 + }, + { + "epoch": 0.395751176695077, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.7866418361663818, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8571416139602661, + "num_tokens": 118784320.0, + "step": 3111 + }, + { + "epoch": 0.39587838697366745, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.8554061651229858, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8514139652252197, + "num_tokens": 118821456.0, + "step": 3112 + }, + { + "epoch": 0.396005597252258, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.9179131984710693, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8549497127532959, + "num_tokens": 118861746.0, + "step": 3113 + }, + { + "epoch": 0.3961328075308485, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.8644146919250488, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8617947697639465, + "num_tokens": 118900346.0, + "step": 3114 + }, + { + "epoch": 0.396260017809439, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.8377453088760376, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8573274612426758, + "num_tokens": 118939707.0, + "step": 3115 + }, + { + "epoch": 0.3963872280880295, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.895239233970642, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8560131788253784, + "num_tokens": 118971154.0, + "step": 3116 + }, + { + "epoch": 0.39651443836662004, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.861431360244751, + "learning_rate": 1e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8412166833877563, + "num_tokens": 119012344.0, + "step": 3117 + }, + { + "epoch": 0.3966416486452105, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 2.1104447841644287, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8554010987281799, + "num_tokens": 119047677.0, + "step": 3118 + }, + { + "epoch": 0.39676885892380104, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.8917888402938843, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8701033592224121, + "num_tokens": 119086357.0, + "step": 3119 + }, + { + "epoch": 0.39689606920239157, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.966792345046997, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8643197417259216, + "num_tokens": 119122854.0, + "step": 3120 + }, + { + "epoch": 0.39702327948098204, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.9319871664047241, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8648421168327332, + "num_tokens": 119155449.0, + "step": 3121 + }, + { + "epoch": 0.39715048975957257, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.8233699798583984, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8512499332427979, + "num_tokens": 119197287.0, + "step": 3122 + }, + { + "epoch": 0.3972777000381631, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.9876059293746948, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8444619178771973, + "num_tokens": 119235749.0, + "step": 3123 + }, + { + "epoch": 0.39740491031675357, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.985780119895935, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8534054756164551, + "num_tokens": 119267282.0, + "step": 3124 + }, + { + "epoch": 0.3975321205953441, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.880078911781311, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8495556116104126, + "num_tokens": 119304387.0, + "step": 3125 + }, + { + "epoch": 0.3976593308739346, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.9295839071273804, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8462623953819275, + "num_tokens": 119341470.0, + "step": 3126 + }, + { + "epoch": 0.3977865411525251, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.9179000854492188, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8455517292022705, + "num_tokens": 119381067.0, + "step": 3127 + }, + { + "epoch": 0.3979137514311156, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.9090602397918701, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8562540411949158, + "num_tokens": 119415680.0, + "step": 3128 + }, + { + "epoch": 0.39804096170970615, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.0373446941375732, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8469996452331543, + "num_tokens": 119454300.0, + "step": 3129 + }, + { + "epoch": 0.3981681719882967, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.011937141418457, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8689446449279785, + "num_tokens": 119485599.0, + "step": 3130 + }, + { + "epoch": 0.39829538226688715, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.0961971282958984, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8417635560035706, + "num_tokens": 119525820.0, + "step": 3131 + }, + { + "epoch": 0.3984225925454777, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.8629896640777588, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8623306751251221, + "num_tokens": 119568216.0, + "step": 3132 + }, + { + "epoch": 0.3985498028240682, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.9841091632843018, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8430663347244263, + "num_tokens": 119602914.0, + "step": 3133 + }, + { + "epoch": 0.3986770131026587, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.9441226720809937, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8529098629951477, + "num_tokens": 119637024.0, + "step": 3134 + }, + { + "epoch": 0.3988042233812492, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.9050852060317993, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.848132848739624, + "num_tokens": 119679265.0, + "step": 3135 + }, + { + "epoch": 0.39893143365983974, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.98048734664917, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8473684787750244, + "num_tokens": 119716254.0, + "step": 3136 + }, + { + "epoch": 0.3990586439384302, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.7599717378616333, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8497304916381836, + "num_tokens": 119761325.0, + "step": 3137 + }, + { + "epoch": 0.39918585421702074, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.9535235166549683, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8789464235305786, + "num_tokens": 119795468.0, + "step": 3138 + }, + { + "epoch": 0.39931306449561127, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.9195760488510132, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8460825681686401, + "num_tokens": 119833804.0, + "step": 3139 + }, + { + "epoch": 0.39944027477420174, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.8236082792282104, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8521039485931396, + "num_tokens": 119870852.0, + "step": 3140 + }, + { + "epoch": 0.39956748505279227, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.8626197576522827, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.859125018119812, + "num_tokens": 119909258.0, + "step": 3141 + }, + { + "epoch": 0.3996946953313828, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.0300378799438477, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8440144658088684, + "num_tokens": 119940701.0, + "step": 3142 + }, + { + "epoch": 0.39982190560997327, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.021592617034912, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8575440049171448, + "num_tokens": 119976780.0, + "step": 3143 + }, + { + "epoch": 0.3999491158885638, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.9636478424072266, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8491014242172241, + "num_tokens": 120017543.0, + "step": 3144 + }, + { + "epoch": 0.40007632616715433, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.027280807495117, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8528766632080078, + "num_tokens": 120050036.0, + "step": 3145 + }, + { + "epoch": 0.4002035364457448, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.0952625274658203, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8699459433555603, + "num_tokens": 120084582.0, + "step": 3146 + }, + { + "epoch": 0.40033074672433533, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.0252525806427, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.851717472076416, + "num_tokens": 120120218.0, + "step": 3147 + }, + { + "epoch": 0.40045795700292586, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.7207640409469604, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8765556812286377, + "num_tokens": 120161650.0, + "step": 3148 + }, + { + "epoch": 0.40058516728151633, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.835471749305725, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8648925423622131, + "num_tokens": 120197409.0, + "step": 3149 + }, + { + "epoch": 0.40071237756010686, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.825558066368103, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8436810970306396, + "num_tokens": 120240700.0, + "step": 3150 + }, + { + "epoch": 0.4008395878386974, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.715336799621582, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8463393449783325, + "num_tokens": 120286741.0, + "step": 3151 + }, + { + "epoch": 0.40096679811728786, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.9431140422821045, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8582149147987366, + "num_tokens": 120323236.0, + "step": 3152 + }, + { + "epoch": 0.4010940083958784, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.9904643297195435, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8553034067153931, + "num_tokens": 120362185.0, + "step": 3153 + }, + { + "epoch": 0.4012212186744689, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.9771474599838257, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8481724262237549, + "num_tokens": 120397524.0, + "step": 3154 + }, + { + "epoch": 0.4013484289530594, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.8988784551620483, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.859106183052063, + "num_tokens": 120439723.0, + "step": 3155 + }, + { + "epoch": 0.4014756392316499, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.8097960948944092, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.859404444694519, + "num_tokens": 120483693.0, + "step": 3156 + }, + { + "epoch": 0.40160284951024044, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.7925610542297363, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.866489052772522, + "num_tokens": 120520558.0, + "step": 3157 + }, + { + "epoch": 0.4017300597888309, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.8017702102661133, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8648914694786072, + "num_tokens": 120557901.0, + "step": 3158 + }, + { + "epoch": 0.40185727006742145, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.151836633682251, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.863254725933075, + "num_tokens": 120593851.0, + "step": 3159 + }, + { + "epoch": 0.401984480346012, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.3640077114105225, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8486576080322266, + "num_tokens": 120624533.0, + "step": 3160 + }, + { + "epoch": 0.40211169062460245, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.0954997539520264, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8515023589134216, + "num_tokens": 120660820.0, + "step": 3161 + }, + { + "epoch": 0.402238900903193, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.7821162939071655, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8688980937004089, + "num_tokens": 120701485.0, + "step": 3162 + }, + { + "epoch": 0.4023661111817835, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.9901385307312012, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8651574850082397, + "num_tokens": 120736677.0, + "step": 3163 + }, + { + "epoch": 0.402493321460374, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.0642144680023193, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8540884256362915, + "num_tokens": 120773933.0, + "step": 3164 + }, + { + "epoch": 0.4026205317389645, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.0286693572998047, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8618508577346802, + "num_tokens": 120804671.0, + "step": 3165 + }, + { + "epoch": 0.40274774201755503, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.9529603719711304, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8607882261276245, + "num_tokens": 120837042.0, + "step": 3166 + }, + { + "epoch": 0.4028749522961455, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.7906956672668457, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8533308506011963, + "num_tokens": 120875127.0, + "step": 3167 + }, + { + "epoch": 0.40300216257473603, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.1431219577789307, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8547782897949219, + "num_tokens": 120915937.0, + "step": 3168 + }, + { + "epoch": 0.40312937285332656, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.9711527824401855, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8416726589202881, + "num_tokens": 120953334.0, + "step": 3169 + }, + { + "epoch": 0.40325658313191703, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.9525394439697266, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8570648431777954, + "num_tokens": 120986674.0, + "step": 3170 + }, + { + "epoch": 0.40338379341050756, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.72980535030365, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8621980547904968, + "num_tokens": 121026305.0, + "step": 3171 + }, + { + "epoch": 0.4035110036890981, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.9060760736465454, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8694753050804138, + "num_tokens": 121061644.0, + "step": 3172 + }, + { + "epoch": 0.40363821396768856, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.718314528465271, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8528404235839844, + "num_tokens": 121102860.0, + "step": 3173 + }, + { + "epoch": 0.4037654242462791, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.8734256029129028, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8689593076705933, + "num_tokens": 121136566.0, + "step": 3174 + }, + { + "epoch": 0.4038926345248696, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.9041270017623901, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8662802577018738, + "num_tokens": 121173759.0, + "step": 3175 + }, + { + "epoch": 0.4040198448034601, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.787419080734253, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8569965958595276, + "num_tokens": 121219881.0, + "step": 3176 + }, + { + "epoch": 0.4041470550820506, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.8458881378173828, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8560070991516113, + "num_tokens": 121258719.0, + "step": 3177 + }, + { + "epoch": 0.40427426536064115, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.8617762327194214, + "learning_rate": 1e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8345907926559448, + "num_tokens": 121298013.0, + "step": 3178 + }, + { + "epoch": 0.4044014756392316, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.9787085056304932, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.86814284324646, + "num_tokens": 121329615.0, + "step": 3179 + }, + { + "epoch": 0.40452868591782215, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.7835949659347534, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8668162226676941, + "num_tokens": 121365199.0, + "step": 3180 + }, + { + "epoch": 0.4046558961964127, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.0964925289154053, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8519194722175598, + "num_tokens": 121402393.0, + "step": 3181 + }, + { + "epoch": 0.4047831064750032, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.8371134996414185, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8570622801780701, + "num_tokens": 121439678.0, + "step": 3182 + }, + { + "epoch": 0.4049103167535937, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.0889415740966797, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8513521552085876, + "num_tokens": 121480781.0, + "step": 3183 + }, + { + "epoch": 0.4050375270321842, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.7659289836883545, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.863659679889679, + "num_tokens": 121517614.0, + "step": 3184 + }, + { + "epoch": 0.40516473731077474, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.0205042362213135, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8538880944252014, + "num_tokens": 121552225.0, + "step": 3185 + }, + { + "epoch": 0.4052919475893652, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.9302546977996826, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8526259064674377, + "num_tokens": 121587177.0, + "step": 3186 + }, + { + "epoch": 0.40541915786795574, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.031839370727539, + "learning_rate": 1e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8381990194320679, + "num_tokens": 121619547.0, + "step": 3187 + }, + { + "epoch": 0.40554636814654627, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 6.406662464141846, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8634299039840698, + "num_tokens": 121657396.0, + "step": 3188 + }, + { + "epoch": 0.40567357842513674, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.3443164825439453, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8407498598098755, + "num_tokens": 121691867.0, + "step": 3189 + }, + { + "epoch": 0.40580078870372727, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 2.1909005641937256, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8511328101158142, + "num_tokens": 121723546.0, + "step": 3190 + }, + { + "epoch": 0.4059279989823178, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.15750789642334, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8561389446258545, + "num_tokens": 121764589.0, + "step": 3191 + }, + { + "epoch": 0.40605520926090827, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9748975038528442, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8462456464767456, + "num_tokens": 121803281.0, + "step": 3192 + }, + { + "epoch": 0.4061824195394988, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 2.1401376724243164, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8446018099784851, + "num_tokens": 121841710.0, + "step": 3193 + }, + { + "epoch": 0.4063096298180893, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.8970354795455933, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8555120825767517, + "num_tokens": 121881979.0, + "step": 3194 + }, + { + "epoch": 0.4064368400966798, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.8737680912017822, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8601256608963013, + "num_tokens": 121921599.0, + "step": 3195 + }, + { + "epoch": 0.4065640503752703, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.7984408140182495, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8654851913452148, + "num_tokens": 121960545.0, + "step": 3196 + }, + { + "epoch": 0.40669126065386085, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.8073718547821045, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8369103074073792, + "num_tokens": 121999259.0, + "step": 3197 + }, + { + "epoch": 0.4068184709324513, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.8434929847717285, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8692925572395325, + "num_tokens": 122039066.0, + "step": 3198 + }, + { + "epoch": 0.40694568121104185, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.6438324451446533, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8648777008056641, + "num_tokens": 122085035.0, + "step": 3199 + }, + { + "epoch": 0.4070728914896324, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.7138875722885132, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8690450191497803, + "num_tokens": 122128007.0, + "step": 3200 + }, + { + "epoch": 0.40720010176822286, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.9137296676635742, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8547472953796387, + "num_tokens": 122169623.0, + "step": 3201 + }, + { + "epoch": 0.4073273120468134, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.018950939178467, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8512752056121826, + "num_tokens": 122201755.0, + "step": 3202 + }, + { + "epoch": 0.4074545223254039, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.0633795261383057, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8667466640472412, + "num_tokens": 122233680.0, + "step": 3203 + }, + { + "epoch": 0.4075817326039944, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.077948808670044, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8537370562553406, + "num_tokens": 122265107.0, + "step": 3204 + }, + { + "epoch": 0.4077089428825849, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9029991626739502, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8618981838226318, + "num_tokens": 122302750.0, + "step": 3205 + }, + { + "epoch": 0.40783615316117544, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.7616655826568604, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8480297327041626, + "num_tokens": 122346820.0, + "step": 3206 + }, + { + "epoch": 0.4079633634397659, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9140864610671997, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8613519668579102, + "num_tokens": 122383254.0, + "step": 3207 + }, + { + "epoch": 0.40809057371835644, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9587589502334595, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8687070608139038, + "num_tokens": 122416693.0, + "step": 3208 + }, + { + "epoch": 0.40821778399694697, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.001007080078125, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8623961210250854, + "num_tokens": 122452403.0, + "step": 3209 + }, + { + "epoch": 0.40834499427553744, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.884093165397644, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8456847667694092, + "num_tokens": 122489685.0, + "step": 3210 + }, + { + "epoch": 0.40847220455412797, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9380348920822144, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8396289944648743, + "num_tokens": 122529079.0, + "step": 3211 + }, + { + "epoch": 0.4085994148327185, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.7332544326782227, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8502063751220703, + "num_tokens": 122569933.0, + "step": 3212 + }, + { + "epoch": 0.40872662511130897, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9287290573120117, + "learning_rate": 1e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8370321393013, + "num_tokens": 122607884.0, + "step": 3213 + }, + { + "epoch": 0.4088538353898995, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.00876522064209, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8574855327606201, + "num_tokens": 122651968.0, + "step": 3214 + }, + { + "epoch": 0.40898104566849003, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.024364471435547, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8605948686599731, + "num_tokens": 122688680.0, + "step": 3215 + }, + { + "epoch": 0.4091082559470805, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.7837896347045898, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.861736536026001, + "num_tokens": 122728271.0, + "step": 3216 + }, + { + "epoch": 0.40923546622567103, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9322112798690796, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8560755848884583, + "num_tokens": 122759691.0, + "step": 3217 + }, + { + "epoch": 0.40936267650426156, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9610687494277954, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8541904091835022, + "num_tokens": 122800185.0, + "step": 3218 + }, + { + "epoch": 0.40948988678285203, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.8747692108154297, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8658485412597656, + "num_tokens": 122839548.0, + "step": 3219 + }, + { + "epoch": 0.40961709706144256, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.810930609703064, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8427481055259705, + "num_tokens": 122879621.0, + "step": 3220 + }, + { + "epoch": 0.4097443073400331, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9073984622955322, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8506216406822205, + "num_tokens": 122921495.0, + "step": 3221 + }, + { + "epoch": 0.40987151761862356, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.8876029253005981, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8644921779632568, + "num_tokens": 122957197.0, + "step": 3222 + }, + { + "epoch": 0.4099987278972141, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9558982849121094, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8391584157943726, + "num_tokens": 122999999.0, + "step": 3223 + }, + { + "epoch": 0.4101259381758046, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.975795865058899, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8614892363548279, + "num_tokens": 123040402.0, + "step": 3224 + }, + { + "epoch": 0.4102531484543951, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.8287858963012695, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8602754473686218, + "num_tokens": 123080086.0, + "step": 3225 + }, + { + "epoch": 0.4103803587329856, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9564274549484253, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8582314848899841, + "num_tokens": 123120058.0, + "step": 3226 + }, + { + "epoch": 0.41050756901157615, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.7197612524032593, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8580513596534729, + "num_tokens": 123162003.0, + "step": 3227 + }, + { + "epoch": 0.4106347792901666, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.8945748805999756, + "learning_rate": 1e-06, + "loss": 0.5483, + "mean_token_accuracy": 0.824583113193512, + "num_tokens": 123202249.0, + "step": 3228 + }, + { + "epoch": 0.41076198956875715, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9066858291625977, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.850455641746521, + "num_tokens": 123237401.0, + "step": 3229 + }, + { + "epoch": 0.4108891998473477, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.0186569690704346, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8726100921630859, + "num_tokens": 123269364.0, + "step": 3230 + }, + { + "epoch": 0.4110164101259382, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.109865427017212, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8604662418365479, + "num_tokens": 123307743.0, + "step": 3231 + }, + { + "epoch": 0.4111436204045287, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.0675477981567383, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8621867895126343, + "num_tokens": 123340102.0, + "step": 3232 + }, + { + "epoch": 0.4112708306831192, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.1053147315979004, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8518965244293213, + "num_tokens": 123374956.0, + "step": 3233 + }, + { + "epoch": 0.41139804096170973, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.0372822284698486, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.865607738494873, + "num_tokens": 123418070.0, + "step": 3234 + }, + { + "epoch": 0.4115252512403002, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.1094472408294678, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8536254167556763, + "num_tokens": 123453183.0, + "step": 3235 + }, + { + "epoch": 0.41165246151889073, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9697070121765137, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8580716848373413, + "num_tokens": 123491526.0, + "step": 3236 + }, + { + "epoch": 0.41177967179748126, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9608709812164307, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8579409718513489, + "num_tokens": 123529743.0, + "step": 3237 + }, + { + "epoch": 0.41190688207607173, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.999853253364563, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8593516945838928, + "num_tokens": 123573402.0, + "step": 3238 + }, + { + "epoch": 0.41203409235466226, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.4400055408477783, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8541211485862732, + "num_tokens": 123604167.0, + "step": 3239 + }, + { + "epoch": 0.4121613026332528, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 1.755808711051941, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8537139296531677, + "num_tokens": 123648104.0, + "step": 3240 + }, + { + "epoch": 0.41228851291184326, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.8715968132019043, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8467381000518799, + "num_tokens": 123689678.0, + "step": 3241 + }, + { + "epoch": 0.4124157231904338, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.052704095840454, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8645485639572144, + "num_tokens": 123729257.0, + "step": 3242 + }, + { + "epoch": 0.4125429334690243, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.8127697706222534, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8579860925674438, + "num_tokens": 123772251.0, + "step": 3243 + }, + { + "epoch": 0.4126701437476148, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.8739111423492432, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8694243431091309, + "num_tokens": 123810345.0, + "step": 3244 + }, + { + "epoch": 0.4127973540262053, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.843614101409912, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8533642292022705, + "num_tokens": 123855276.0, + "step": 3245 + }, + { + "epoch": 0.41292456430479585, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.170494556427002, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8599053025245667, + "num_tokens": 123896613.0, + "step": 3246 + }, + { + "epoch": 0.4130517745833863, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.920888900756836, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8404753804206848, + "num_tokens": 123940713.0, + "step": 3247 + }, + { + "epoch": 0.41317898486197685, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.8890122175216675, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8466277122497559, + "num_tokens": 123981485.0, + "step": 3248 + }, + { + "epoch": 0.4133061951405674, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.8595032691955566, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8572824001312256, + "num_tokens": 124018159.0, + "step": 3249 + }, + { + "epoch": 0.41343340541915785, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.04506778717041, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8662289977073669, + "num_tokens": 124049911.0, + "step": 3250 + }, + { + "epoch": 0.4135606156977484, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.0503737926483154, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8779053092002869, + "num_tokens": 124084260.0, + "step": 3251 + }, + { + "epoch": 0.4136878259763389, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.9252309799194336, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8594906330108643, + "num_tokens": 124122741.0, + "step": 3252 + }, + { + "epoch": 0.4138150362549294, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.8225523233413696, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8543248772621155, + "num_tokens": 124162793.0, + "step": 3253 + }, + { + "epoch": 0.4139422465335199, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.9517865180969238, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8497616052627563, + "num_tokens": 124206889.0, + "step": 3254 + }, + { + "epoch": 0.41406945681211044, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.89792799949646, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8674322366714478, + "num_tokens": 124244551.0, + "step": 3255 + }, + { + "epoch": 0.4141966670907009, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9753973484039307, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8594539165496826, + "num_tokens": 124290485.0, + "step": 3256 + }, + { + "epoch": 0.41432387736929144, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.024071216583252, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8650548458099365, + "num_tokens": 124323763.0, + "step": 3257 + }, + { + "epoch": 0.41445108764788197, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9059518575668335, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8569189310073853, + "num_tokens": 124363318.0, + "step": 3258 + }, + { + "epoch": 0.41457829792647244, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.302924156188965, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8786444664001465, + "num_tokens": 124400845.0, + "step": 3259 + }, + { + "epoch": 0.41470550820506297, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.7102266550064087, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8651398420333862, + "num_tokens": 124442546.0, + "step": 3260 + }, + { + "epoch": 0.4148327184836535, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.976956844329834, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8530320525169373, + "num_tokens": 124477737.0, + "step": 3261 + }, + { + "epoch": 0.41495992876224397, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9756816625595093, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8440501689910889, + "num_tokens": 124518081.0, + "step": 3262 + }, + { + "epoch": 0.4150871390408345, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.8480889797210693, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8472451567649841, + "num_tokens": 124556361.0, + "step": 3263 + }, + { + "epoch": 0.415214349319425, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9583617448806763, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8558385372161865, + "num_tokens": 124588490.0, + "step": 3264 + }, + { + "epoch": 0.4153415595980155, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.8477174043655396, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8567473888397217, + "num_tokens": 124624656.0, + "step": 3265 + }, + { + "epoch": 0.415468769876606, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 2.0939385890960693, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8421323299407959, + "num_tokens": 124660681.0, + "step": 3266 + }, + { + "epoch": 0.41559598015519655, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 2.32013201713562, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.857158899307251, + "num_tokens": 124703448.0, + "step": 3267 + }, + { + "epoch": 0.415723190433787, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 16.59528160095215, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8555454015731812, + "num_tokens": 124742078.0, + "step": 3268 + }, + { + "epoch": 0.41585040071237755, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.1688425540924072, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8451958894729614, + "num_tokens": 124779894.0, + "step": 3269 + }, + { + "epoch": 0.4159776109909681, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8698747158050537, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8521000742912292, + "num_tokens": 124817918.0, + "step": 3270 + }, + { + "epoch": 0.41610482126955856, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8448749780654907, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8730889558792114, + "num_tokens": 124854839.0, + "step": 3271 + }, + { + "epoch": 0.4162320315481491, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.058429718017578, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8411968946456909, + "num_tokens": 124886779.0, + "step": 3272 + }, + { + "epoch": 0.4163592418267396, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9091862440109253, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8645994663238525, + "num_tokens": 124927919.0, + "step": 3273 + }, + { + "epoch": 0.4164864521053301, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8912514448165894, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8547130823135376, + "num_tokens": 124966475.0, + "step": 3274 + }, + { + "epoch": 0.4166136623839206, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9613652229309082, + "learning_rate": 1e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.831895649433136, + "num_tokens": 125004110.0, + "step": 3275 + }, + { + "epoch": 0.41674087266251114, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.7572605609893799, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8702515959739685, + "num_tokens": 125043164.0, + "step": 3276 + }, + { + "epoch": 0.4168680829411016, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.7718995809555054, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8606125712394714, + "num_tokens": 125084255.0, + "step": 3277 + }, + { + "epoch": 0.41699529321969214, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.723440170288086, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8699549436569214, + "num_tokens": 125124971.0, + "step": 3278 + }, + { + "epoch": 0.41712250349828267, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9224870204925537, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8529804348945618, + "num_tokens": 125161470.0, + "step": 3279 + }, + { + "epoch": 0.4172497137768732, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 1.874878168106079, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8406103253364563, + "num_tokens": 125199117.0, + "step": 3280 + }, + { + "epoch": 0.41737692405546367, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.927332878112793, + "learning_rate": 1e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8379062414169312, + "num_tokens": 125238136.0, + "step": 3281 + }, + { + "epoch": 0.4175041343340542, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.794917106628418, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.854128360748291, + "num_tokens": 125281041.0, + "step": 3282 + }, + { + "epoch": 0.41763134461264473, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9459195137023926, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8582693338394165, + "num_tokens": 125314739.0, + "step": 3283 + }, + { + "epoch": 0.4177585548912352, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.079636573791504, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8505522012710571, + "num_tokens": 125353129.0, + "step": 3284 + }, + { + "epoch": 0.41788576516982573, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.945381999015808, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8544100522994995, + "num_tokens": 125394092.0, + "step": 3285 + }, + { + "epoch": 0.41801297544841626, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.983564853668213, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8514026999473572, + "num_tokens": 125438262.0, + "step": 3286 + }, + { + "epoch": 0.41814018572700673, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.934041976928711, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8559627532958984, + "num_tokens": 125477842.0, + "step": 3287 + }, + { + "epoch": 0.41826739600559726, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9401413202285767, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.867150604724884, + "num_tokens": 125517236.0, + "step": 3288 + }, + { + "epoch": 0.4183946062841878, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.724687099456787, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8498128056526184, + "num_tokens": 125561598.0, + "step": 3289 + }, + { + "epoch": 0.41852181656277826, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.994645118713379, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8536968231201172, + "num_tokens": 125599104.0, + "step": 3290 + }, + { + "epoch": 0.4186490268413688, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.0342721939086914, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8564407229423523, + "num_tokens": 125635286.0, + "step": 3291 + }, + { + "epoch": 0.4187762371199593, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.036262273788452, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8610014319419861, + "num_tokens": 125669534.0, + "step": 3292 + }, + { + "epoch": 0.4189034473985498, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.008683681488037, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8552149534225464, + "num_tokens": 125707966.0, + "step": 3293 + }, + { + "epoch": 0.4190306576771403, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.0398001670837402, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8507336974143982, + "num_tokens": 125741448.0, + "step": 3294 + }, + { + "epoch": 0.41915786795573085, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.0538570880889893, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8501570224761963, + "num_tokens": 125776887.0, + "step": 3295 + }, + { + "epoch": 0.4192850782343213, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.0357506275177, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8615756034851074, + "num_tokens": 125807899.0, + "step": 3296 + }, + { + "epoch": 0.41941228851291185, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8599376678466797, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.863561749458313, + "num_tokens": 125847711.0, + "step": 3297 + }, + { + "epoch": 0.4195394987915024, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.102294683456421, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8695030808448792, + "num_tokens": 125883248.0, + "step": 3298 + }, + { + "epoch": 0.41966670907009285, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.84260892868042, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8453947305679321, + "num_tokens": 125925771.0, + "step": 3299 + }, + { + "epoch": 0.4197939193486834, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9288218021392822, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8441930413246155, + "num_tokens": 125968815.0, + "step": 3300 + }, + { + "epoch": 0.4199211296272739, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.7680763006210327, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8506312370300293, + "num_tokens": 126007761.0, + "step": 3301 + }, + { + "epoch": 0.4200483399058644, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9479964971542358, + "learning_rate": 1e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8375685811042786, + "num_tokens": 126047872.0, + "step": 3302 + }, + { + "epoch": 0.4201755501844549, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8859232664108276, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8624101281166077, + "num_tokens": 126081273.0, + "step": 3303 + }, + { + "epoch": 0.42030276046304543, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 2.004685640335083, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8409444093704224, + "num_tokens": 126121590.0, + "step": 3304 + }, + { + "epoch": 0.4204299707416359, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8289132118225098, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8391329050064087, + "num_tokens": 126168396.0, + "step": 3305 + }, + { + "epoch": 0.42055718102022643, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9426305294036865, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8512899875640869, + "num_tokens": 126205220.0, + "step": 3306 + }, + { + "epoch": 0.42068439129881696, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9617791175842285, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8505194187164307, + "num_tokens": 126239131.0, + "step": 3307 + }, + { + "epoch": 0.42081160157740743, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9671151638031006, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8586940765380859, + "num_tokens": 126271931.0, + "step": 3308 + }, + { + "epoch": 0.42093881185599796, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8605339527130127, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8641601204872131, + "num_tokens": 126307730.0, + "step": 3309 + }, + { + "epoch": 0.4210660221345885, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.0764498710632324, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8580060601234436, + "num_tokens": 126344528.0, + "step": 3310 + }, + { + "epoch": 0.42119323241317896, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9011276960372925, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8645573258399963, + "num_tokens": 126377260.0, + "step": 3311 + }, + { + "epoch": 0.4213204426917695, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.9376468658447266, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8516767024993896, + "num_tokens": 126409723.0, + "step": 3312 + }, + { + "epoch": 0.42144765297036, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.8584810495376587, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8608577251434326, + "num_tokens": 126450929.0, + "step": 3313 + }, + { + "epoch": 0.4215748632489505, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.008402109146118, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8656702041625977, + "num_tokens": 126486878.0, + "step": 3314 + }, + { + "epoch": 0.421702073527541, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 2.121619462966919, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8436069488525391, + "num_tokens": 126522884.0, + "step": 3315 + }, + { + "epoch": 0.42182928380613155, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 2.0103514194488525, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8578987121582031, + "num_tokens": 126560848.0, + "step": 3316 + }, + { + "epoch": 0.421956494084722, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.982880711555481, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8578979969024658, + "num_tokens": 126592947.0, + "step": 3317 + }, + { + "epoch": 0.42208370436331255, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.8740408420562744, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8667699098587036, + "num_tokens": 126630631.0, + "step": 3318 + }, + { + "epoch": 0.4222109146419031, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.967809796333313, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8528620004653931, + "num_tokens": 126673395.0, + "step": 3319 + }, + { + "epoch": 0.42233812492049355, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.8967254161834717, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8717577457427979, + "num_tokens": 126711504.0, + "step": 3320 + }, + { + "epoch": 0.4224653351990841, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.7049189805984497, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.86870276927948, + "num_tokens": 126753217.0, + "step": 3321 + }, + { + "epoch": 0.4225925454776746, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.7092876434326172, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8621456623077393, + "num_tokens": 126792982.0, + "step": 3322 + }, + { + "epoch": 0.4227197557562651, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 2.0135467052459717, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.850138783454895, + "num_tokens": 126823771.0, + "step": 3323 + }, + { + "epoch": 0.4228469660348556, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8493214845657349, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8599603176116943, + "num_tokens": 126867137.0, + "step": 3324 + }, + { + "epoch": 0.42297417631344614, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.9998255968093872, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8620893955230713, + "num_tokens": 126902680.0, + "step": 3325 + }, + { + "epoch": 0.4231013865920366, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8868921995162964, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8680635690689087, + "num_tokens": 126941037.0, + "step": 3326 + }, + { + "epoch": 0.42322859687062714, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8646129369735718, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8598307967185974, + "num_tokens": 126980009.0, + "step": 3327 + }, + { + "epoch": 0.42335580714921767, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9072778224945068, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8693045973777771, + "num_tokens": 127020022.0, + "step": 3328 + }, + { + "epoch": 0.42348301742780814, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.849117636680603, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8379395008087158, + "num_tokens": 127060500.0, + "step": 3329 + }, + { + "epoch": 0.42361022770639867, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 2.0041067600250244, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8535847067832947, + "num_tokens": 127095886.0, + "step": 3330 + }, + { + "epoch": 0.4237374379849892, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 2.011902332305908, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.847585141658783, + "num_tokens": 127134334.0, + "step": 3331 + }, + { + "epoch": 0.4238646482635797, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.962586522102356, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8491158485412598, + "num_tokens": 127174288.0, + "step": 3332 + }, + { + "epoch": 0.4239918585421702, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9755979776382446, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8535120487213135, + "num_tokens": 127210311.0, + "step": 3333 + }, + { + "epoch": 0.4241190688207607, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8704746961593628, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8784329295158386, + "num_tokens": 127252813.0, + "step": 3334 + }, + { + "epoch": 0.42424627909935125, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.104111671447754, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8458341360092163, + "num_tokens": 127285847.0, + "step": 3335 + }, + { + "epoch": 0.4243734893779417, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.0924527645111084, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.871105432510376, + "num_tokens": 127325360.0, + "step": 3336 + }, + { + "epoch": 0.42450069965653225, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8813809156417847, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8608540296554565, + "num_tokens": 127366446.0, + "step": 3337 + }, + { + "epoch": 0.4246279099351228, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.863692045211792, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8669235110282898, + "num_tokens": 127402608.0, + "step": 3338 + }, + { + "epoch": 0.42475512021371326, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.1016604900360107, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8627632856369019, + "num_tokens": 127445355.0, + "step": 3339 + }, + { + "epoch": 0.4248823304923038, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9931578636169434, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8509093523025513, + "num_tokens": 127480523.0, + "step": 3340 + }, + { + "epoch": 0.4250095407708943, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8336430788040161, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8764665722846985, + "num_tokens": 127520787.0, + "step": 3341 + }, + { + "epoch": 0.4251367510494848, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8484688997268677, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8526633977890015, + "num_tokens": 127565576.0, + "step": 3342 + }, + { + "epoch": 0.4252639613280753, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.038867712020874, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8683636784553528, + "num_tokens": 127599144.0, + "step": 3343 + }, + { + "epoch": 0.42539117160666584, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9454755783081055, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8749747276306152, + "num_tokens": 127637361.0, + "step": 3344 + }, + { + "epoch": 0.4255183818852563, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9689606428146362, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8473010063171387, + "num_tokens": 127675139.0, + "step": 3345 + }, + { + "epoch": 0.42564559216384684, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.742222547531128, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.863757848739624, + "num_tokens": 127718822.0, + "step": 3346 + }, + { + "epoch": 0.42577280244243737, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.6970869302749634, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8609740734100342, + "num_tokens": 127763332.0, + "step": 3347 + }, + { + "epoch": 0.42590001272102784, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.7939625978469849, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8687059283256531, + "num_tokens": 127800346.0, + "step": 3348 + }, + { + "epoch": 0.42602722299961837, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 1.8705302476882935, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8545539379119873, + "num_tokens": 127845142.0, + "step": 3349 + }, + { + "epoch": 0.4261544332782089, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 2.0009217262268066, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8406009674072266, + "num_tokens": 127881131.0, + "step": 3350 + }, + { + "epoch": 0.4262816435567994, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 1.8773424625396729, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8711333274841309, + "num_tokens": 127917941.0, + "step": 3351 + }, + { + "epoch": 0.4264088538353899, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 1.9221818447113037, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.844456136226654, + "num_tokens": 127963345.0, + "step": 3352 + }, + { + "epoch": 0.42653606411398043, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.7650246620178223, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8665229082107544, + "num_tokens": 128004839.0, + "step": 3353 + }, + { + "epoch": 0.4266632743925709, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8820905685424805, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.851658284664154, + "num_tokens": 128042709.0, + "step": 3354 + }, + { + "epoch": 0.42679048467116143, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9555014371871948, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8422215580940247, + "num_tokens": 128085069.0, + "step": 3355 + }, + { + "epoch": 0.42691769494975196, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.972068428993225, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8521969318389893, + "num_tokens": 128124300.0, + "step": 3356 + }, + { + "epoch": 0.42704490522834243, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8394988775253296, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8549258708953857, + "num_tokens": 128164410.0, + "step": 3357 + }, + { + "epoch": 0.42717211550693296, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.7417659759521484, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8682529926300049, + "num_tokens": 128206524.0, + "step": 3358 + }, + { + "epoch": 0.4272993257855235, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8945800065994263, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8486464023590088, + "num_tokens": 128246387.0, + "step": 3359 + }, + { + "epoch": 0.42742653606411396, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8688400983810425, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8672546744346619, + "num_tokens": 128287794.0, + "step": 3360 + }, + { + "epoch": 0.4275537463427045, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9233568906784058, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8547043800354004, + "num_tokens": 128328467.0, + "step": 3361 + }, + { + "epoch": 0.427680956621295, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8747012615203857, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8662564158439636, + "num_tokens": 128364319.0, + "step": 3362 + }, + { + "epoch": 0.4278081668998855, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.026914596557617, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.851578950881958, + "num_tokens": 128403892.0, + "step": 3363 + }, + { + "epoch": 0.427935377178476, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.7457548379898071, + "learning_rate": 1e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8378732204437256, + "num_tokens": 128446760.0, + "step": 3364 + }, + { + "epoch": 0.42806258745706655, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.0540754795074463, + "learning_rate": 1e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8322561383247375, + "num_tokens": 128485498.0, + "step": 3365 + }, + { + "epoch": 0.428189797735657, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.9441777467727661, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8453812599182129, + "num_tokens": 128527871.0, + "step": 3366 + }, + { + "epoch": 0.42831700801424755, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.8067706823349, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8428645133972168, + "num_tokens": 128567792.0, + "step": 3367 + }, + { + "epoch": 0.4284442182928381, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.8161412477493286, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8456076979637146, + "num_tokens": 128607217.0, + "step": 3368 + }, + { + "epoch": 0.42857142857142855, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.9687223434448242, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.862910270690918, + "num_tokens": 128642787.0, + "step": 3369 + }, + { + "epoch": 0.4286986388500191, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.8726255893707275, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8681531548500061, + "num_tokens": 128680462.0, + "step": 3370 + }, + { + "epoch": 0.4288258491286096, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.9047540426254272, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8528268337249756, + "num_tokens": 128715101.0, + "step": 3371 + }, + { + "epoch": 0.4289530594072001, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 2.007664203643799, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8482385873794556, + "num_tokens": 128749601.0, + "step": 3372 + }, + { + "epoch": 0.4290802696857906, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.8163336515426636, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8526373505592346, + "num_tokens": 128788744.0, + "step": 3373 + }, + { + "epoch": 0.42920747996438113, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.9717481136322021, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8527681827545166, + "num_tokens": 128828102.0, + "step": 3374 + }, + { + "epoch": 0.4293346902429716, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.8726885318756104, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8519707918167114, + "num_tokens": 128863610.0, + "step": 3375 + }, + { + "epoch": 0.42946190052156213, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.7980796098709106, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8711949586868286, + "num_tokens": 128898836.0, + "step": 3376 + }, + { + "epoch": 0.42958911080015266, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.8701667785644531, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8573301434516907, + "num_tokens": 128934821.0, + "step": 3377 + }, + { + "epoch": 0.42971632107874314, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 2.133552312850952, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8549352884292603, + "num_tokens": 128966914.0, + "step": 3378 + }, + { + "epoch": 0.42984353135733366, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.760878324508667, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8629884719848633, + "num_tokens": 129008305.0, + "step": 3379 + }, + { + "epoch": 0.4299707416359242, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.9415199756622314, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8771741986274719, + "num_tokens": 129043331.0, + "step": 3380 + }, + { + "epoch": 0.4300979519145147, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.7148451805114746, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8619488477706909, + "num_tokens": 129083513.0, + "step": 3381 + }, + { + "epoch": 0.4302251621931052, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 2.231083869934082, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8560051918029785, + "num_tokens": 129113463.0, + "step": 3382 + }, + { + "epoch": 0.4303523724716957, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 2.0212764739990234, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8336243629455566, + "num_tokens": 129153080.0, + "step": 3383 + }, + { + "epoch": 0.43047958275028625, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.9862090349197388, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8541469573974609, + "num_tokens": 129193338.0, + "step": 3384 + }, + { + "epoch": 0.4306067930288767, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.9626280069351196, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8468093276023865, + "num_tokens": 129234816.0, + "step": 3385 + }, + { + "epoch": 0.43073400330746725, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.8191967010498047, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8600466251373291, + "num_tokens": 129275228.0, + "step": 3386 + }, + { + "epoch": 0.4308612135860578, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.9654449224472046, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8380739092826843, + "num_tokens": 129309747.0, + "step": 3387 + }, + { + "epoch": 0.43098842386464825, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 2.036938428878784, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8593035340309143, + "num_tokens": 129342580.0, + "step": 3388 + }, + { + "epoch": 0.4311156341432388, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 2.1460986137390137, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8522716164588928, + "num_tokens": 129376599.0, + "step": 3389 + }, + { + "epoch": 0.4312428444218293, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 2.0766968727111816, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8542649745941162, + "num_tokens": 129414783.0, + "step": 3390 + }, + { + "epoch": 0.4313700547004198, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.9697195291519165, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8556839227676392, + "num_tokens": 129450287.0, + "step": 3391 + }, + { + "epoch": 0.4314972649790103, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 2.0197336673736572, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8379298448562622, + "num_tokens": 129486733.0, + "step": 3392 + }, + { + "epoch": 0.43162447525760084, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.9214990139007568, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8561180830001831, + "num_tokens": 129524392.0, + "step": 3393 + }, + { + "epoch": 0.4317516855361913, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 2.0155441761016846, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8419791460037231, + "num_tokens": 129558452.0, + "step": 3394 + }, + { + "epoch": 0.43187889581478184, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.7957531213760376, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8550260066986084, + "num_tokens": 129597076.0, + "step": 3395 + }, + { + "epoch": 0.43200610609337237, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 2.0216972827911377, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8515229225158691, + "num_tokens": 129635861.0, + "step": 3396 + }, + { + "epoch": 0.43213331637196284, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.8288156986236572, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8521342277526855, + "num_tokens": 129673760.0, + "step": 3397 + }, + { + "epoch": 0.43226052665055337, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 2.1007208824157715, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8589802384376526, + "num_tokens": 129710730.0, + "step": 3398 + }, + { + "epoch": 0.4323877369291439, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.990439534187317, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8531419634819031, + "num_tokens": 129748724.0, + "step": 3399 + }, + { + "epoch": 0.43251494720773437, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.811810851097107, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8682483434677124, + "num_tokens": 129787478.0, + "step": 3400 + }, + { + "epoch": 0.4326421574863249, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.8143551349639893, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8673821091651917, + "num_tokens": 129829852.0, + "step": 3401 + }, + { + "epoch": 0.4327693677649154, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.8326952457427979, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8553037643432617, + "num_tokens": 129869279.0, + "step": 3402 + }, + { + "epoch": 0.4328965780435059, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.9254764318466187, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8509889841079712, + "num_tokens": 129903347.0, + "step": 3403 + }, + { + "epoch": 0.4330237883220964, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.7776087522506714, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8668319582939148, + "num_tokens": 129940028.0, + "step": 3404 + }, + { + "epoch": 0.43315099860068695, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.9164832830429077, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8651010394096375, + "num_tokens": 129976438.0, + "step": 3405 + }, + { + "epoch": 0.4332782088792774, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.835412621498108, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8613492846488953, + "num_tokens": 130015698.0, + "step": 3406 + }, + { + "epoch": 0.43340541915786795, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.8730711936950684, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.87176114320755, + "num_tokens": 130056000.0, + "step": 3407 + }, + { + "epoch": 0.4335326294364585, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.83307683467865, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8537108302116394, + "num_tokens": 130096990.0, + "step": 3408 + }, + { + "epoch": 0.43365983971504896, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.859929084777832, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8659021258354187, + "num_tokens": 130128556.0, + "step": 3409 + }, + { + "epoch": 0.4337870499936395, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.9010145664215088, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8631287813186646, + "num_tokens": 130170607.0, + "step": 3410 + }, + { + "epoch": 0.43391426027223, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.8876529932022095, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8527529835700989, + "num_tokens": 130212511.0, + "step": 3411 + }, + { + "epoch": 0.4340414705508205, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 2.3213541507720947, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8504443168640137, + "num_tokens": 130249246.0, + "step": 3412 + }, + { + "epoch": 0.434168680829411, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.910042643547058, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8765945434570312, + "num_tokens": 130291250.0, + "step": 3413 + }, + { + "epoch": 0.43429589110800154, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.9368515014648438, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8375239372253418, + "num_tokens": 130327094.0, + "step": 3414 + }, + { + "epoch": 0.434423101386592, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.8251947164535522, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8487688899040222, + "num_tokens": 130366482.0, + "step": 3415 + }, + { + "epoch": 0.43455031166518254, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.823907494544983, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8575565218925476, + "num_tokens": 130407416.0, + "step": 3416 + }, + { + "epoch": 0.43467752194377307, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.769112229347229, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8619009852409363, + "num_tokens": 130447125.0, + "step": 3417 + }, + { + "epoch": 0.43480473222236354, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.9638071060180664, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8632165193557739, + "num_tokens": 130480955.0, + "step": 3418 + }, + { + "epoch": 0.43493194250095407, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 2.040515899658203, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.868951678276062, + "num_tokens": 130511771.0, + "step": 3419 + }, + { + "epoch": 0.4350591527795446, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.8476752042770386, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8556150197982788, + "num_tokens": 130550080.0, + "step": 3420 + }, + { + "epoch": 0.4351863630581351, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.972826600074768, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8466302752494812, + "num_tokens": 130588109.0, + "step": 3421 + }, + { + "epoch": 0.4353135733367256, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.9594298601150513, + "learning_rate": 1e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8430427312850952, + "num_tokens": 130626832.0, + "step": 3422 + }, + { + "epoch": 0.43544078361531613, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.9500426054000854, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8608686923980713, + "num_tokens": 130662202.0, + "step": 3423 + }, + { + "epoch": 0.4355679938939066, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 2.0165798664093018, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8523643612861633, + "num_tokens": 130700747.0, + "step": 3424 + }, + { + "epoch": 0.43569520417249713, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.9626126289367676, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8542191982269287, + "num_tokens": 130742849.0, + "step": 3425 + }, + { + "epoch": 0.43582241445108766, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.8820018768310547, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8700107932090759, + "num_tokens": 130780156.0, + "step": 3426 + }, + { + "epoch": 0.43594962472967813, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.6967542171478271, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8567901849746704, + "num_tokens": 130825824.0, + "step": 3427 + }, + { + "epoch": 0.43607683500826866, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.9724323749542236, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8658774495124817, + "num_tokens": 130863603.0, + "step": 3428 + }, + { + "epoch": 0.4362040452868592, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.8944200277328491, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8580694198608398, + "num_tokens": 130906689.0, + "step": 3429 + }, + { + "epoch": 0.4363312555654497, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 80.5226821899414, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8547887206077576, + "num_tokens": 130947785.0, + "step": 3430 + }, + { + "epoch": 0.4364584658440402, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.1623036861419678, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.867566704750061, + "num_tokens": 130985489.0, + "step": 3431 + }, + { + "epoch": 0.4365856761226307, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.822069525718689, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8656389713287354, + "num_tokens": 131025274.0, + "step": 3432 + }, + { + "epoch": 0.43671288640122125, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9840725660324097, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.856002151966095, + "num_tokens": 131060132.0, + "step": 3433 + }, + { + "epoch": 0.4368400966798117, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9328197240829468, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8459528684616089, + "num_tokens": 131105109.0, + "step": 3434 + }, + { + "epoch": 0.43696730695840225, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9060643911361694, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8728761672973633, + "num_tokens": 131142094.0, + "step": 3435 + }, + { + "epoch": 0.4370945172369928, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 2.111750841140747, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8555598258972168, + "num_tokens": 131174648.0, + "step": 3436 + }, + { + "epoch": 0.43722172751558325, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.831388235092163, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8718523979187012, + "num_tokens": 131213447.0, + "step": 3437 + }, + { + "epoch": 0.4373489377941738, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.9553438425064087, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8531346321105957, + "num_tokens": 131249622.0, + "step": 3438 + }, + { + "epoch": 0.4374761480727643, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.7442578077316284, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8720943331718445, + "num_tokens": 131285883.0, + "step": 3439 + }, + { + "epoch": 0.4376033583513548, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 2.010075807571411, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8597163558006287, + "num_tokens": 131322774.0, + "step": 3440 + }, + { + "epoch": 0.4377305686299453, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.9430370330810547, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.870151698589325, + "num_tokens": 131364513.0, + "step": 3441 + }, + { + "epoch": 0.43785777890853583, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.8475940227508545, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8479335308074951, + "num_tokens": 131402299.0, + "step": 3442 + }, + { + "epoch": 0.4379849891871263, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.6406112909317017, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8646009564399719, + "num_tokens": 131447830.0, + "step": 3443 + }, + { + "epoch": 0.43811219946571683, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.957237958908081, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8503915667533875, + "num_tokens": 131481058.0, + "step": 3444 + }, + { + "epoch": 0.43823940974430736, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 2.082183361053467, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8543919324874878, + "num_tokens": 131518376.0, + "step": 3445 + }, + { + "epoch": 0.43836662002289783, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 2.046860933303833, + "learning_rate": 1e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.843065083026886, + "num_tokens": 131553702.0, + "step": 3446 + }, + { + "epoch": 0.43849383030148836, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 2.0905776023864746, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8652126789093018, + "num_tokens": 131587026.0, + "step": 3447 + }, + { + "epoch": 0.4386210405800789, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8288143873214722, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8635238409042358, + "num_tokens": 131621145.0, + "step": 3448 + }, + { + "epoch": 0.43874825085866936, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8101634979248047, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8632924556732178, + "num_tokens": 131661618.0, + "step": 3449 + }, + { + "epoch": 0.4388754611372599, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9330520629882812, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8522302508354187, + "num_tokens": 131693840.0, + "step": 3450 + }, + { + "epoch": 0.4390026714158504, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9158532619476318, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8605289459228516, + "num_tokens": 131730856.0, + "step": 3451 + }, + { + "epoch": 0.4391298816944409, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8740875720977783, + "learning_rate": 1e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.8330090641975403, + "num_tokens": 131769172.0, + "step": 3452 + }, + { + "epoch": 0.4392570919730314, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.75331449508667, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8599590063095093, + "num_tokens": 131810481.0, + "step": 3453 + }, + { + "epoch": 0.43938430225162195, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9041613340377808, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.860325813293457, + "num_tokens": 131850742.0, + "step": 3454 + }, + { + "epoch": 0.4395115125302124, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.0289905071258545, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8558056950569153, + "num_tokens": 131886675.0, + "step": 3455 + }, + { + "epoch": 0.43963872280880295, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8223392963409424, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8636481761932373, + "num_tokens": 131926831.0, + "step": 3456 + }, + { + "epoch": 0.4397659330873935, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8649638891220093, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.857467770576477, + "num_tokens": 131962265.0, + "step": 3457 + }, + { + "epoch": 0.43989314336598395, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.330274820327759, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8525109887123108, + "num_tokens": 131999906.0, + "step": 3458 + }, + { + "epoch": 0.4400203536445745, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.106966495513916, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.873029887676239, + "num_tokens": 132031330.0, + "step": 3459 + }, + { + "epoch": 0.440147563923165, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9792284965515137, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8567352294921875, + "num_tokens": 132065049.0, + "step": 3460 + }, + { + "epoch": 0.4402747742017555, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8352751731872559, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8625723719596863, + "num_tokens": 132100993.0, + "step": 3461 + }, + { + "epoch": 0.440401984480346, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.972684383392334, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8566523790359497, + "num_tokens": 132138169.0, + "step": 3462 + }, + { + "epoch": 0.44052919475893654, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.1593990325927734, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8646287322044373, + "num_tokens": 132170498.0, + "step": 3463 + }, + { + "epoch": 0.440656405037527, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9198099374771118, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8393175601959229, + "num_tokens": 132213583.0, + "step": 3464 + }, + { + "epoch": 0.44078361531611754, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.708944320678711, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8538280725479126, + "num_tokens": 132255427.0, + "step": 3465 + }, + { + "epoch": 0.44091082559470807, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8001421689987183, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8560870885848999, + "num_tokens": 132295795.0, + "step": 3466 + }, + { + "epoch": 0.44103803587329854, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.826331615447998, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8404174447059631, + "num_tokens": 132336881.0, + "step": 3467 + }, + { + "epoch": 0.44116524615188907, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.756036400794983, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8579609394073486, + "num_tokens": 132375136.0, + "step": 3468 + }, + { + "epoch": 0.4412924564304796, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9534627199172974, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8546926379203796, + "num_tokens": 132412893.0, + "step": 3469 + }, + { + "epoch": 0.44141966670907007, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.2570066452026367, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8527945280075073, + "num_tokens": 132444360.0, + "step": 3470 + }, + { + "epoch": 0.4415468769876606, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9379686117172241, + "learning_rate": 1e-06, + "loss": 0.5479, + "mean_token_accuracy": 0.8325560092926025, + "num_tokens": 132485022.0, + "step": 3471 + }, + { + "epoch": 0.4416740872662511, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8838684558868408, + "learning_rate": 1e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8425737619400024, + "num_tokens": 132522800.0, + "step": 3472 + }, + { + "epoch": 0.4418012975448416, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.0444324016571045, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8594593405723572, + "num_tokens": 132560527.0, + "step": 3473 + }, + { + "epoch": 0.4419285078234321, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.892386794090271, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8554152846336365, + "num_tokens": 132601075.0, + "step": 3474 + }, + { + "epoch": 0.44205571810202265, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.026984930038452, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8472413420677185, + "num_tokens": 132639851.0, + "step": 3475 + }, + { + "epoch": 0.4421829283806131, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.849757432937622, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8586381673812866, + "num_tokens": 132679849.0, + "step": 3476 + }, + { + "epoch": 0.44231013865920366, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9281798601150513, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8618614673614502, + "num_tokens": 132721020.0, + "step": 3477 + }, + { + "epoch": 0.4424373489377942, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.96941077709198, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8442350625991821, + "num_tokens": 132760933.0, + "step": 3478 + }, + { + "epoch": 0.44256455921638466, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8847535848617554, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8672648072242737, + "num_tokens": 132800627.0, + "step": 3479 + }, + { + "epoch": 0.4426917694949752, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.060049057006836, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8576123714447021, + "num_tokens": 132833232.0, + "step": 3480 + }, + { + "epoch": 0.4428189797735657, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.7812563180923462, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8549301624298096, + "num_tokens": 132874670.0, + "step": 3481 + }, + { + "epoch": 0.44294619005215624, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.5709946155548096, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8595139980316162, + "num_tokens": 132916187.0, + "step": 3482 + }, + { + "epoch": 0.4430734003307467, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9315072298049927, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8596539497375488, + "num_tokens": 132956948.0, + "step": 3483 + }, + { + "epoch": 0.44320061060933724, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.230334997177124, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8628551959991455, + "num_tokens": 132995213.0, + "step": 3484 + }, + { + "epoch": 0.44332782088792777, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.892090082168579, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8541218638420105, + "num_tokens": 133038850.0, + "step": 3485 + }, + { + "epoch": 0.44345503116651824, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8626052141189575, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8584363460540771, + "num_tokens": 133080766.0, + "step": 3486 + }, + { + "epoch": 0.44358224144510877, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8585985898971558, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8537463545799255, + "num_tokens": 133120845.0, + "step": 3487 + }, + { + "epoch": 0.4437094517236993, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.7010128498077393, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8544477224349976, + "num_tokens": 133166011.0, + "step": 3488 + }, + { + "epoch": 0.4438366620022898, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9592127799987793, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8548204898834229, + "num_tokens": 133203507.0, + "step": 3489 + }, + { + "epoch": 0.4439638722808803, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9972937107086182, + "learning_rate": 1e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8333858847618103, + "num_tokens": 133240554.0, + "step": 3490 + }, + { + "epoch": 0.44409108255947083, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.253157615661621, + "learning_rate": 1e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.838909924030304, + "num_tokens": 133274987.0, + "step": 3491 + }, + { + "epoch": 0.4442182928380613, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.110321044921875, + "learning_rate": 1e-06, + "loss": 0.5352, + "mean_token_accuracy": 0.8290920257568359, + "num_tokens": 133319287.0, + "step": 3492 + }, + { + "epoch": 0.44434550311665183, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.1541404724121094, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8616564273834229, + "num_tokens": 133353670.0, + "step": 3493 + }, + { + "epoch": 0.44447271339524236, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9247848987579346, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8575866222381592, + "num_tokens": 133391950.0, + "step": 3494 + }, + { + "epoch": 0.44459992367383283, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8560281991958618, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8629201650619507, + "num_tokens": 133424758.0, + "step": 3495 + }, + { + "epoch": 0.44472713395242336, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.1422266960144043, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8853528499603271, + "num_tokens": 133459869.0, + "step": 3496 + }, + { + "epoch": 0.4448543442310139, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.1802263259887695, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8556444644927979, + "num_tokens": 133494598.0, + "step": 3497 + }, + { + "epoch": 0.44498155450960436, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8656896352767944, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8644193410873413, + "num_tokens": 133536403.0, + "step": 3498 + }, + { + "epoch": 0.4451087647881949, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.1178648471832275, + "learning_rate": 1e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8371086120605469, + "num_tokens": 133567319.0, + "step": 3499 + }, + { + "epoch": 0.4452359750667854, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.1613986492156982, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.848792552947998, + "num_tokens": 133604829.0, + "step": 3500 + }, + { + "epoch": 0.4453631853453759, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8823461532592773, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8562872409820557, + "num_tokens": 133638373.0, + "step": 3501 + }, + { + "epoch": 0.4454903956239664, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9188686609268188, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.867118239402771, + "num_tokens": 133674110.0, + "step": 3502 + }, + { + "epoch": 0.44561760590255695, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9715403318405151, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.858664333820343, + "num_tokens": 133711005.0, + "step": 3503 + }, + { + "epoch": 0.4457448161811474, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8510504961013794, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.870025634765625, + "num_tokens": 133746258.0, + "step": 3504 + }, + { + "epoch": 0.44587202645973795, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8083645105361938, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.86529541015625, + "num_tokens": 133790695.0, + "step": 3505 + }, + { + "epoch": 0.4459992367383285, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 3.03347110748291, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8542348146438599, + "num_tokens": 133830181.0, + "step": 3506 + }, + { + "epoch": 0.44612644701691895, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8963276147842407, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8622448444366455, + "num_tokens": 133866849.0, + "step": 3507 + }, + { + "epoch": 0.4462536572955095, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.001497268676758, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8569259643554688, + "num_tokens": 133906812.0, + "step": 3508 + }, + { + "epoch": 0.4463808675741, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.180084228515625, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8511128425598145, + "num_tokens": 133948956.0, + "step": 3509 + }, + { + "epoch": 0.4465080778526905, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.906115174293518, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8400532007217407, + "num_tokens": 133988764.0, + "step": 3510 + }, + { + "epoch": 0.446635288131281, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.0377919673919678, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.862115740776062, + "num_tokens": 134028148.0, + "step": 3511 + }, + { + "epoch": 0.44676249840987153, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.1071181297302246, + "learning_rate": 1e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.8342564105987549, + "num_tokens": 134065909.0, + "step": 3512 + }, + { + "epoch": 0.446889708688462, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.7221733331680298, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8629173636436462, + "num_tokens": 134108342.0, + "step": 3513 + }, + { + "epoch": 0.44701691896705253, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8593207597732544, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8558825254440308, + "num_tokens": 134145928.0, + "step": 3514 + }, + { + "epoch": 0.44714412924564306, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9482542276382446, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8521470427513123, + "num_tokens": 134185955.0, + "step": 3515 + }, + { + "epoch": 0.44727133952423354, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.7324485778808594, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8629179000854492, + "num_tokens": 134225829.0, + "step": 3516 + }, + { + "epoch": 0.44739854980282406, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.0444765090942383, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8474922180175781, + "num_tokens": 134261381.0, + "step": 3517 + }, + { + "epoch": 0.4475257600814146, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 3.085193157196045, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8655716776847839, + "num_tokens": 134294697.0, + "step": 3518 + }, + { + "epoch": 0.44765297036000506, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8181360960006714, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8774641752243042, + "num_tokens": 134334349.0, + "step": 3519 + }, + { + "epoch": 0.4477801806385956, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9248408079147339, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8454912900924683, + "num_tokens": 134373381.0, + "step": 3520 + }, + { + "epoch": 0.4479073909171861, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8936413526535034, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.862648069858551, + "num_tokens": 134407952.0, + "step": 3521 + }, + { + "epoch": 0.4480346011957766, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.778432011604309, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8468680381774902, + "num_tokens": 134458862.0, + "step": 3522 + }, + { + "epoch": 0.4481618114743671, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9139509201049805, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8543027639389038, + "num_tokens": 134494007.0, + "step": 3523 + }, + { + "epoch": 0.44828902175295765, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.1063172817230225, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8425577878952026, + "num_tokens": 134534195.0, + "step": 3524 + }, + { + "epoch": 0.4484162320315481, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9578436613082886, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8644511103630066, + "num_tokens": 134570241.0, + "step": 3525 + }, + { + "epoch": 0.44854344231013865, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.7824212312698364, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8557005524635315, + "num_tokens": 134611904.0, + "step": 3526 + }, + { + "epoch": 0.4486706525887292, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.0032031536102295, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8476833701133728, + "num_tokens": 134644426.0, + "step": 3527 + }, + { + "epoch": 0.44879786286731965, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.917829990386963, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8657263517379761, + "num_tokens": 134678979.0, + "step": 3528 + }, + { + "epoch": 0.4489250731459102, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9756439924240112, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8597457408905029, + "num_tokens": 134717270.0, + "step": 3529 + }, + { + "epoch": 0.4490522834245007, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.0790247917175293, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8576079607009888, + "num_tokens": 134755464.0, + "step": 3530 + }, + { + "epoch": 0.44917949370309124, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.7465959787368774, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.850025475025177, + "num_tokens": 134793737.0, + "step": 3531 + }, + { + "epoch": 0.4493067039816817, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.027366876602173, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8515981435775757, + "num_tokens": 134834332.0, + "step": 3532 + }, + { + "epoch": 0.44943391426027224, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8790521621704102, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8681728839874268, + "num_tokens": 134871027.0, + "step": 3533 + }, + { + "epoch": 0.44956112453886277, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9121280908584595, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8584557175636292, + "num_tokens": 134908529.0, + "step": 3534 + }, + { + "epoch": 0.44968833481745324, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.0534799098968506, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8662883043289185, + "num_tokens": 134944177.0, + "step": 3535 + }, + { + "epoch": 0.44981554509604377, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.101259708404541, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.855736494064331, + "num_tokens": 134977928.0, + "step": 3536 + }, + { + "epoch": 0.4499427553746343, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.820539951324463, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8569350242614746, + "num_tokens": 135017554.0, + "step": 3537 + }, + { + "epoch": 0.45006996565322477, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.7167726755142212, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.863476037979126, + "num_tokens": 135058694.0, + "step": 3538 + }, + { + "epoch": 0.4501971759318153, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.93867826461792, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8597979545593262, + "num_tokens": 135094037.0, + "step": 3539 + }, + { + "epoch": 0.4503243862104058, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8358980417251587, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8513745069503784, + "num_tokens": 135132758.0, + "step": 3540 + }, + { + "epoch": 0.4504515964889963, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.130147933959961, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8694539666175842, + "num_tokens": 135169398.0, + "step": 3541 + }, + { + "epoch": 0.4505788067675868, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.0254766941070557, + "learning_rate": 1e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8432034850120544, + "num_tokens": 135204312.0, + "step": 3542 + }, + { + "epoch": 0.45070601704617735, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8310027122497559, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8542300462722778, + "num_tokens": 135246287.0, + "step": 3543 + }, + { + "epoch": 0.4508332273247678, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8128345012664795, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8612207174301147, + "num_tokens": 135280970.0, + "step": 3544 + }, + { + "epoch": 0.45096043760335836, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9252545833587646, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8790249824523926, + "num_tokens": 135317873.0, + "step": 3545 + }, + { + "epoch": 0.4510876478819489, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9113491773605347, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8643942475318909, + "num_tokens": 135354994.0, + "step": 3546 + }, + { + "epoch": 0.45121485816053936, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9801063537597656, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8517547845840454, + "num_tokens": 135393050.0, + "step": 3547 + }, + { + "epoch": 0.4513420684391299, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.9035505056381226, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8617159128189087, + "num_tokens": 135431704.0, + "step": 3548 + }, + { + "epoch": 0.4514692787177204, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8395118713378906, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8357602953910828, + "num_tokens": 135471306.0, + "step": 3549 + }, + { + "epoch": 0.4515964889963109, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.004697799682617, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8451679944992065, + "num_tokens": 135509647.0, + "step": 3550 + }, + { + "epoch": 0.4517236992749014, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.0679194927215576, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8650072813034058, + "num_tokens": 135550469.0, + "step": 3551 + }, + { + "epoch": 0.45185090955349194, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8388038873672485, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.856747031211853, + "num_tokens": 135589290.0, + "step": 3552 + }, + { + "epoch": 0.4519781198320824, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.89396071434021, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8510401844978333, + "num_tokens": 135624416.0, + "step": 3553 + }, + { + "epoch": 0.45210533011067294, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8988555669784546, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8613581657409668, + "num_tokens": 135662316.0, + "step": 3554 + }, + { + "epoch": 0.45223254038926347, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8775042295455933, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8585836887359619, + "num_tokens": 135701739.0, + "step": 3555 + }, + { + "epoch": 0.45235975066785394, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9756083488464355, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8534947037696838, + "num_tokens": 135741372.0, + "step": 3556 + }, + { + "epoch": 0.45248696094644447, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9246015548706055, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8791796565055847, + "num_tokens": 135779945.0, + "step": 3557 + }, + { + "epoch": 0.452614171225035, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9587570428848267, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8517347574234009, + "num_tokens": 135813179.0, + "step": 3558 + }, + { + "epoch": 0.4527413815036255, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9670034646987915, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8607295155525208, + "num_tokens": 135855147.0, + "step": 3559 + }, + { + "epoch": 0.452868591782216, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8876374959945679, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8650023341178894, + "num_tokens": 135892990.0, + "step": 3560 + }, + { + "epoch": 0.45299580206080653, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 1.8617253303527832, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8751800060272217, + "num_tokens": 135926911.0, + "step": 3561 + }, + { + "epoch": 0.453123012339397, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.045376777648926, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8659323453903198, + "num_tokens": 135963530.0, + "step": 3562 + }, + { + "epoch": 0.45325022261798753, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.94716477394104, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8493712544441223, + "num_tokens": 136000677.0, + "step": 3563 + }, + { + "epoch": 0.45337743289657806, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9099035263061523, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.847540020942688, + "num_tokens": 136037283.0, + "step": 3564 + }, + { + "epoch": 0.45350464317516853, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.1381328105926514, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8398488759994507, + "num_tokens": 136071454.0, + "step": 3565 + }, + { + "epoch": 0.45363185345375906, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.848063588142395, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8456621766090393, + "num_tokens": 136105638.0, + "step": 3566 + }, + { + "epoch": 0.4537590637323496, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.7847657203674316, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8607074022293091, + "num_tokens": 136147968.0, + "step": 3567 + }, + { + "epoch": 0.45388627401094006, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8390932083129883, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8574322462081909, + "num_tokens": 136190468.0, + "step": 3568 + }, + { + "epoch": 0.4540134842895306, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9282965660095215, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8613929748535156, + "num_tokens": 136229363.0, + "step": 3569 + }, + { + "epoch": 0.4541406945681211, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.7783594131469727, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8515142202377319, + "num_tokens": 136268834.0, + "step": 3570 + }, + { + "epoch": 0.4542679048467116, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9448848962783813, + "learning_rate": 1e-06, + "loss": 0.5367, + "mean_token_accuracy": 0.834459125995636, + "num_tokens": 136306761.0, + "step": 3571 + }, + { + "epoch": 0.4543951151253021, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.7807749509811401, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8563502430915833, + "num_tokens": 136347736.0, + "step": 3572 + }, + { + "epoch": 0.45452232540389265, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.0285890102386475, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.859930157661438, + "num_tokens": 136381009.0, + "step": 3573 + }, + { + "epoch": 0.4546495356824831, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9868470430374146, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.868944525718689, + "num_tokens": 136418050.0, + "step": 3574 + }, + { + "epoch": 0.45477674596107365, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.822640061378479, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8404254913330078, + "num_tokens": 136459109.0, + "step": 3575 + }, + { + "epoch": 0.4549039562396642, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8466460704803467, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8530543446540833, + "num_tokens": 136497993.0, + "step": 3576 + }, + { + "epoch": 0.45503116651825465, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.028855085372925, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8600318431854248, + "num_tokens": 136534446.0, + "step": 3577 + }, + { + "epoch": 0.4551583767968452, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.0235595703125, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8408883213996887, + "num_tokens": 136571117.0, + "step": 3578 + }, + { + "epoch": 0.4552855870754357, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.0125720500946045, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8634658455848694, + "num_tokens": 136601706.0, + "step": 3579 + }, + { + "epoch": 0.4554127973540262, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8180986642837524, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8575133085250854, + "num_tokens": 136639735.0, + "step": 3580 + }, + { + "epoch": 0.4555400076326167, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8903459310531616, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8509433269500732, + "num_tokens": 136675647.0, + "step": 3581 + }, + { + "epoch": 0.45566721791120723, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9601771831512451, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8662472367286682, + "num_tokens": 136713361.0, + "step": 3582 + }, + { + "epoch": 0.45579442818979776, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8361726999282837, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.856438159942627, + "num_tokens": 136751440.0, + "step": 3583 + }, + { + "epoch": 0.45592163846838824, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.2599847316741943, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8531469106674194, + "num_tokens": 136786078.0, + "step": 3584 + }, + { + "epoch": 0.45604884874697876, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.7737324237823486, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8640098571777344, + "num_tokens": 136828313.0, + "step": 3585 + }, + { + "epoch": 0.4561760590255693, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9893078804016113, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8557866811752319, + "num_tokens": 136862549.0, + "step": 3586 + }, + { + "epoch": 0.45630326930415976, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8812365531921387, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8598895072937012, + "num_tokens": 136906423.0, + "step": 3587 + }, + { + "epoch": 0.4564304795827503, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9076554775238037, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8469136953353882, + "num_tokens": 136941550.0, + "step": 3588 + }, + { + "epoch": 0.4565576898613408, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.7867835760116577, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8614598512649536, + "num_tokens": 136980234.0, + "step": 3589 + }, + { + "epoch": 0.4566849001399313, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.759353756904602, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8608043193817139, + "num_tokens": 137023991.0, + "step": 3590 + }, + { + "epoch": 0.4568121104185218, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.0577328205108643, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.848166286945343, + "num_tokens": 137063261.0, + "step": 3591 + }, + { + "epoch": 0.45693932069711235, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.7763886451721191, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8667794466018677, + "num_tokens": 137103935.0, + "step": 3592 + }, + { + "epoch": 0.4570665309757028, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9246041774749756, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8567616939544678, + "num_tokens": 137140194.0, + "step": 3593 + }, + { + "epoch": 0.45719374125429335, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.0245730876922607, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8472784757614136, + "num_tokens": 137179086.0, + "step": 3594 + }, + { + "epoch": 0.4573209515328839, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.960159182548523, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8628740906715393, + "num_tokens": 137216887.0, + "step": 3595 + }, + { + "epoch": 0.45744816181147435, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8148506879806519, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8733721971511841, + "num_tokens": 137255029.0, + "step": 3596 + }, + { + "epoch": 0.4575753720900649, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8417707681655884, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8604742288589478, + "num_tokens": 137291399.0, + "step": 3597 + }, + { + "epoch": 0.4577025823686554, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.902951717376709, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8626307845115662, + "num_tokens": 137326569.0, + "step": 3598 + }, + { + "epoch": 0.4578297926472459, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.7711831331253052, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8531091809272766, + "num_tokens": 137370035.0, + "step": 3599 + }, + { + "epoch": 0.4579570029258364, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9289453029632568, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8533867001533508, + "num_tokens": 137407969.0, + "step": 3600 + }, + { + "epoch": 0.45808421320442694, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.785950779914856, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8557503819465637, + "num_tokens": 137447256.0, + "step": 3601 + }, + { + "epoch": 0.4582114234830174, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9581711292266846, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8433947563171387, + "num_tokens": 137479762.0, + "step": 3602 + }, + { + "epoch": 0.45833863376160794, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.852589726448059, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8490054607391357, + "num_tokens": 137521823.0, + "step": 3603 + }, + { + "epoch": 0.45846584404019847, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9272456169128418, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8557807207107544, + "num_tokens": 137558174.0, + "step": 3604 + }, + { + "epoch": 0.45859305431878894, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9368479251861572, + "learning_rate": 1e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.833242654800415, + "num_tokens": 137597265.0, + "step": 3605 + }, + { + "epoch": 0.45872026459737947, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.7688379287719727, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8668884038925171, + "num_tokens": 137635954.0, + "step": 3606 + }, + { + "epoch": 0.45884747487597, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.866363286972046, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8594865798950195, + "num_tokens": 137676137.0, + "step": 3607 + }, + { + "epoch": 0.45897468515456047, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.0135059356689453, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8607320785522461, + "num_tokens": 137716723.0, + "step": 3608 + }, + { + "epoch": 0.459101895433151, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9296361207962036, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8401638865470886, + "num_tokens": 137752435.0, + "step": 3609 + }, + { + "epoch": 0.4592291057117415, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9381848573684692, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8492254614830017, + "num_tokens": 137786684.0, + "step": 3610 + }, + { + "epoch": 0.459356315990332, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9726274013519287, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.84845370054245, + "num_tokens": 137830668.0, + "step": 3611 + }, + { + "epoch": 0.4594835262689225, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.0366408824920654, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8642895817756653, + "num_tokens": 137865262.0, + "step": 3612 + }, + { + "epoch": 0.45961073654751305, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8749641180038452, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8645431995391846, + "num_tokens": 137898867.0, + "step": 3613 + }, + { + "epoch": 0.4597379468261035, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.9156767129898071, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8581923246383667, + "num_tokens": 137935108.0, + "step": 3614 + }, + { + "epoch": 0.45986515710469406, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8418891429901123, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8684221506118774, + "num_tokens": 137977714.0, + "step": 3615 + }, + { + "epoch": 0.4599923673832846, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.090786933898926, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8682429194450378, + "num_tokens": 138006345.0, + "step": 3616 + }, + { + "epoch": 0.46011957766187506, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.8756648302078247, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8569336533546448, + "num_tokens": 138045106.0, + "step": 3617 + }, + { + "epoch": 0.4602467879404656, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.0436131954193115, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8510710597038269, + "num_tokens": 138079993.0, + "step": 3618 + }, + { + "epoch": 0.4603739982190561, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.032081365585327, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.860552191734314, + "num_tokens": 138118555.0, + "step": 3619 + }, + { + "epoch": 0.4605012084976466, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.8735570907592773, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8654558658599854, + "num_tokens": 138155096.0, + "step": 3620 + }, + { + "epoch": 0.4606284187762371, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.877371907234192, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8601034879684448, + "num_tokens": 138194529.0, + "step": 3621 + }, + { + "epoch": 0.46075562905482764, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.8896911144256592, + "learning_rate": 1e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8321527242660522, + "num_tokens": 138235520.0, + "step": 3622 + }, + { + "epoch": 0.4608828393334181, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.085314989089966, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8676549196243286, + "num_tokens": 138271391.0, + "step": 3623 + }, + { + "epoch": 0.46101004961200864, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.7980655431747437, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8523504734039307, + "num_tokens": 138313923.0, + "step": 3624 + }, + { + "epoch": 0.46113725989059917, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.7115026712417603, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.849239706993103, + "num_tokens": 138355992.0, + "step": 3625 + }, + { + "epoch": 0.46126447016918964, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.8092573881149292, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8768873810768127, + "num_tokens": 138393074.0, + "step": 3626 + }, + { + "epoch": 0.4613916804477802, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.9821721315383911, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8647744059562683, + "num_tokens": 138427854.0, + "step": 3627 + }, + { + "epoch": 0.4615188907263707, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.9267253875732422, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8669970035552979, + "num_tokens": 138467359.0, + "step": 3628 + }, + { + "epoch": 0.4616461010049612, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.239231824874878, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8457692861557007, + "num_tokens": 138505355.0, + "step": 3629 + }, + { + "epoch": 0.4617733112835517, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.8421955108642578, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.861306369304657, + "num_tokens": 138543703.0, + "step": 3630 + }, + { + "epoch": 0.46190052156214223, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.8305890560150146, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.85013747215271, + "num_tokens": 138579860.0, + "step": 3631 + }, + { + "epoch": 0.46202773184073276, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.9675418138504028, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8674702048301697, + "num_tokens": 138615755.0, + "step": 3632 + }, + { + "epoch": 0.46215494211932323, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.055171012878418, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8512166738510132, + "num_tokens": 138647607.0, + "step": 3633 + }, + { + "epoch": 0.46228215239791376, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.1004531383514404, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8635494709014893, + "num_tokens": 138679284.0, + "step": 3634 + }, + { + "epoch": 0.4624093626765043, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.8829338550567627, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8680896162986755, + "num_tokens": 138719690.0, + "step": 3635 + }, + { + "epoch": 0.46253657295509476, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.884642243385315, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8465056419372559, + "num_tokens": 138758031.0, + "step": 3636 + }, + { + "epoch": 0.4626637832336853, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.8514692783355713, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8571161031723022, + "num_tokens": 138797283.0, + "step": 3637 + }, + { + "epoch": 0.4627909935122758, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.8416051864624023, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8552569150924683, + "num_tokens": 138834672.0, + "step": 3638 + }, + { + "epoch": 0.4629182037908663, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.899110198020935, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8681871891021729, + "num_tokens": 138871105.0, + "step": 3639 + }, + { + "epoch": 0.4630454140694568, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.9921672344207764, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8600592017173767, + "num_tokens": 138903516.0, + "step": 3640 + }, + { + "epoch": 0.46317262434804735, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.787561297416687, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8559715747833252, + "num_tokens": 138942021.0, + "step": 3641 + }, + { + "epoch": 0.4632998346266378, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.947662115097046, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8668876886367798, + "num_tokens": 138978210.0, + "step": 3642 + }, + { + "epoch": 0.46342704490522835, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.8405014276504517, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8607783317565918, + "num_tokens": 139019122.0, + "step": 3643 + }, + { + "epoch": 0.4635542551838189, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.76736319065094, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8687549829483032, + "num_tokens": 139062187.0, + "step": 3644 + }, + { + "epoch": 0.46368146546240935, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 20.450355529785156, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8770825862884521, + "num_tokens": 139102960.0, + "step": 3645 + }, + { + "epoch": 0.4638086757409999, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.051990032196045, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8604772090911865, + "num_tokens": 139137302.0, + "step": 3646 + }, + { + "epoch": 0.4639358860195904, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.067276954650879, + "learning_rate": 1e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8399440050125122, + "num_tokens": 139177195.0, + "step": 3647 + }, + { + "epoch": 0.4640630962981809, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.994911551475525, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8537105321884155, + "num_tokens": 139214722.0, + "step": 3648 + }, + { + "epoch": 0.4641903065767714, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.870922565460205, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8566522598266602, + "num_tokens": 139248569.0, + "step": 3649 + }, + { + "epoch": 0.46431751685536193, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.7808831930160522, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8645879626274109, + "num_tokens": 139289141.0, + "step": 3650 + }, + { + "epoch": 0.4644447271339524, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.7862366437911987, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8755686283111572, + "num_tokens": 139330791.0, + "step": 3651 + }, + { + "epoch": 0.46457193741254293, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8654175996780396, + "learning_rate": 1e-06, + "loss": 0.5476, + "mean_token_accuracy": 0.8256336450576782, + "num_tokens": 139373009.0, + "step": 3652 + }, + { + "epoch": 0.46469914769113346, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.969191312789917, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8572790622711182, + "num_tokens": 139408894.0, + "step": 3653 + }, + { + "epoch": 0.46482635796972394, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.919192910194397, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8567025661468506, + "num_tokens": 139444040.0, + "step": 3654 + }, + { + "epoch": 0.46495356824831446, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9783122539520264, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8637384176254272, + "num_tokens": 139479675.0, + "step": 3655 + }, + { + "epoch": 0.465080778526905, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8226869106292725, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8656277656555176, + "num_tokens": 139521792.0, + "step": 3656 + }, + { + "epoch": 0.46520798880549546, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8040779829025269, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8649302124977112, + "num_tokens": 139563471.0, + "step": 3657 + }, + { + "epoch": 0.465335199084086, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.770981788635254, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8585149645805359, + "num_tokens": 139604702.0, + "step": 3658 + }, + { + "epoch": 0.4654624093626765, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.7864985466003418, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8568044900894165, + "num_tokens": 139644573.0, + "step": 3659 + }, + { + "epoch": 0.465589619641267, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9426424503326416, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8449460864067078, + "num_tokens": 139680924.0, + "step": 3660 + }, + { + "epoch": 0.4657168299198575, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8729677200317383, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.85017991065979, + "num_tokens": 139724369.0, + "step": 3661 + }, + { + "epoch": 0.46584404019844805, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.7605719566345215, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8579610586166382, + "num_tokens": 139769412.0, + "step": 3662 + }, + { + "epoch": 0.4659712504770385, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.0594449043273926, + "learning_rate": 1e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8345702886581421, + "num_tokens": 139808066.0, + "step": 3663 + }, + { + "epoch": 0.46609846075562905, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9365386962890625, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8637694120407104, + "num_tokens": 139851169.0, + "step": 3664 + }, + { + "epoch": 0.4662256710342196, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.83018159866333, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8670942783355713, + "num_tokens": 139890550.0, + "step": 3665 + }, + { + "epoch": 0.46635288131281005, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.919482946395874, + "learning_rate": 1e-06, + "loss": 0.5334, + "mean_token_accuracy": 0.8318509459495544, + "num_tokens": 139931496.0, + "step": 3666 + }, + { + "epoch": 0.4664800915914006, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.044630289077759, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8485672473907471, + "num_tokens": 139965872.0, + "step": 3667 + }, + { + "epoch": 0.4666073018699911, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.0425755977630615, + "learning_rate": 1e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.8426259160041809, + "num_tokens": 140000394.0, + "step": 3668 + }, + { + "epoch": 0.4667345121485816, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.018096923828125, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8423111438751221, + "num_tokens": 140039967.0, + "step": 3669 + }, + { + "epoch": 0.4668617224271721, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.0187630653381348, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8584750890731812, + "num_tokens": 140075739.0, + "step": 3670 + }, + { + "epoch": 0.46698893270576264, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9209730625152588, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.86133873462677, + "num_tokens": 140110016.0, + "step": 3671 + }, + { + "epoch": 0.4671161429843531, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9154679775238037, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8487110137939453, + "num_tokens": 140147905.0, + "step": 3672 + }, + { + "epoch": 0.46724335326294364, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.214045763015747, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8527661561965942, + "num_tokens": 140182284.0, + "step": 3673 + }, + { + "epoch": 0.46737056354153417, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.883209466934204, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8504071831703186, + "num_tokens": 140220669.0, + "step": 3674 + }, + { + "epoch": 0.46749777382012464, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9667298793792725, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8436942100524902, + "num_tokens": 140258399.0, + "step": 3675 + }, + { + "epoch": 0.46762498409871517, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8509793281555176, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8718176484107971, + "num_tokens": 140296015.0, + "step": 3676 + }, + { + "epoch": 0.4677521943773057, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.7526017427444458, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8479049205780029, + "num_tokens": 140338618.0, + "step": 3677 + }, + { + "epoch": 0.46787940465589617, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.890147089958191, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.861307680606842, + "num_tokens": 140379151.0, + "step": 3678 + }, + { + "epoch": 0.4680066149344867, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.799842357635498, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8441064357757568, + "num_tokens": 140417740.0, + "step": 3679 + }, + { + "epoch": 0.4681338252130772, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.3339407444000244, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8531899452209473, + "num_tokens": 140452454.0, + "step": 3680 + }, + { + "epoch": 0.46826103549166775, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9759018421173096, + "learning_rate": 1e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.8345799446105957, + "num_tokens": 140492981.0, + "step": 3681 + }, + { + "epoch": 0.4683882457702582, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8407703638076782, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.845761775970459, + "num_tokens": 140534131.0, + "step": 3682 + }, + { + "epoch": 0.46851545604884876, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8622287511825562, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8493729829788208, + "num_tokens": 140576761.0, + "step": 3683 + }, + { + "epoch": 0.4686426663274393, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.0440099239349365, + "learning_rate": 1e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.837678074836731, + "num_tokens": 140609538.0, + "step": 3684 + }, + { + "epoch": 0.46876987660602976, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.0536351203918457, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8522691130638123, + "num_tokens": 140641802.0, + "step": 3685 + }, + { + "epoch": 0.4688970868846203, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8777800798416138, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8444911241531372, + "num_tokens": 140685832.0, + "step": 3686 + }, + { + "epoch": 0.4690242971632108, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.093693733215332, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8441898822784424, + "num_tokens": 140725333.0, + "step": 3687 + }, + { + "epoch": 0.4691515074418013, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.116394281387329, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8541812896728516, + "num_tokens": 140760838.0, + "step": 3688 + }, + { + "epoch": 0.4692787177203918, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.0191731452941895, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8463510274887085, + "num_tokens": 140797367.0, + "step": 3689 + }, + { + "epoch": 0.46940592799898234, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.0705649852752686, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8549370765686035, + "num_tokens": 140839778.0, + "step": 3690 + }, + { + "epoch": 0.4695331382775728, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.7518649101257324, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8515238761901855, + "num_tokens": 140880340.0, + "step": 3691 + }, + { + "epoch": 0.46966034855616334, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 4.112737655639648, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8506407737731934, + "num_tokens": 140915451.0, + "step": 3692 + }, + { + "epoch": 0.46978755883475387, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.012446403503418, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8531746864318848, + "num_tokens": 140956967.0, + "step": 3693 + }, + { + "epoch": 0.46991476911334434, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8533141613006592, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8711000680923462, + "num_tokens": 140996640.0, + "step": 3694 + }, + { + "epoch": 0.47004197939193487, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.996731162071228, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8542312383651733, + "num_tokens": 141036094.0, + "step": 3695 + }, + { + "epoch": 0.4701691896705254, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.059845209121704, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8475598096847534, + "num_tokens": 141069033.0, + "step": 3696 + }, + { + "epoch": 0.4702963999491159, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.7620680332183838, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.872885525226593, + "num_tokens": 141111075.0, + "step": 3697 + }, + { + "epoch": 0.4704236102277064, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.8697649240493774, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8720481991767883, + "num_tokens": 141145270.0, + "step": 3698 + }, + { + "epoch": 0.47055082050629693, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.9082528352737427, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.870498776435852, + "num_tokens": 141182132.0, + "step": 3699 + }, + { + "epoch": 0.4706780307848874, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.010117769241333, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8569626212120056, + "num_tokens": 141217455.0, + "step": 3700 + }, + { + "epoch": 0.47080524106347793, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.134394884109497, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8562934398651123, + "num_tokens": 141249680.0, + "step": 3701 + }, + { + "epoch": 0.47093245134206846, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.9504474401474, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8550547957420349, + "num_tokens": 141292085.0, + "step": 3702 + }, + { + "epoch": 0.47105966162065893, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.757084846496582, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.864218533039093, + "num_tokens": 141331455.0, + "step": 3703 + }, + { + "epoch": 0.47118687189924946, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8786946535110474, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8557884693145752, + "num_tokens": 141365623.0, + "step": 3704 + }, + { + "epoch": 0.47131408217784, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.97214937210083, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8478729724884033, + "num_tokens": 141399033.0, + "step": 3705 + }, + { + "epoch": 0.47144129245643046, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.102653980255127, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8638807535171509, + "num_tokens": 141430436.0, + "step": 3706 + }, + { + "epoch": 0.471568502735021, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.0110397338867188, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8741332292556763, + "num_tokens": 141463169.0, + "step": 3707 + }, + { + "epoch": 0.4716957130136115, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.909447193145752, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8759653568267822, + "num_tokens": 141495524.0, + "step": 3708 + }, + { + "epoch": 0.471822923292202, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.43094539642334, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8682066798210144, + "num_tokens": 141533099.0, + "step": 3709 + }, + { + "epoch": 0.4719501335707925, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.7508230209350586, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.862788200378418, + "num_tokens": 141576389.0, + "step": 3710 + }, + { + "epoch": 0.47207734384938305, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.907573938369751, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.85923171043396, + "num_tokens": 141611870.0, + "step": 3711 + }, + { + "epoch": 0.4722045541279735, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8408397436141968, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8672415614128113, + "num_tokens": 141648539.0, + "step": 3712 + }, + { + "epoch": 0.47233176440656405, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 1.9296315908432007, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8596317768096924, + "num_tokens": 141688549.0, + "step": 3713 + }, + { + "epoch": 0.4724589746851546, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.050579786300659, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8645834922790527, + "num_tokens": 141725294.0, + "step": 3714 + }, + { + "epoch": 0.47258618496374505, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.1172995567321777, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8403182029724121, + "num_tokens": 141756761.0, + "step": 3715 + }, + { + "epoch": 0.4727133952423356, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.0603415966033936, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8428940176963806, + "num_tokens": 141790417.0, + "step": 3716 + }, + { + "epoch": 0.4728406055209261, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.752716064453125, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8683282136917114, + "num_tokens": 141830749.0, + "step": 3717 + }, + { + "epoch": 0.4729678157995166, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.029611110687256, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8616438508033752, + "num_tokens": 141868433.0, + "step": 3718 + }, + { + "epoch": 0.4730950260781071, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8859574794769287, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8754174709320068, + "num_tokens": 141907140.0, + "step": 3719 + }, + { + "epoch": 0.47322223635669763, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.7893027067184448, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8713487386703491, + "num_tokens": 141947436.0, + "step": 3720 + }, + { + "epoch": 0.4733494466352881, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9616937637329102, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8502735495567322, + "num_tokens": 141986876.0, + "step": 3721 + }, + { + "epoch": 0.47347665691387864, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.0418474674224854, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8602293729782104, + "num_tokens": 142025806.0, + "step": 3722 + }, + { + "epoch": 0.47360386719246916, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.0946109294891357, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8573687076568604, + "num_tokens": 142057932.0, + "step": 3723 + }, + { + "epoch": 0.47373107747105964, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9332433938980103, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8623663187026978, + "num_tokens": 142088880.0, + "step": 3724 + }, + { + "epoch": 0.47385828774965016, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 3.0755698680877686, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8716899752616882, + "num_tokens": 142124959.0, + "step": 3725 + }, + { + "epoch": 0.4739854980282407, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.043095350265503, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8447582125663757, + "num_tokens": 142160667.0, + "step": 3726 + }, + { + "epoch": 0.47411270830683117, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8318029642105103, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8576779365539551, + "num_tokens": 142205017.0, + "step": 3727 + }, + { + "epoch": 0.4742399185854217, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.0403435230255127, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8642660975456238, + "num_tokens": 142241479.0, + "step": 3728 + }, + { + "epoch": 0.4743671288640122, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9018136262893677, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8579443693161011, + "num_tokens": 142280881.0, + "step": 3729 + }, + { + "epoch": 0.4744943391426027, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.879109263420105, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8652212619781494, + "num_tokens": 142328248.0, + "step": 3730 + }, + { + "epoch": 0.4746215494211932, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.4649343490600586, + "learning_rate": 1e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8422160148620605, + "num_tokens": 142364872.0, + "step": 3731 + }, + { + "epoch": 0.47474875969978375, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.989240050315857, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8404167890548706, + "num_tokens": 142400494.0, + "step": 3732 + }, + { + "epoch": 0.4748759699783743, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.3181798458099365, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8566457033157349, + "num_tokens": 142433739.0, + "step": 3733 + }, + { + "epoch": 0.47500318025696475, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.0750324726104736, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8663939237594604, + "num_tokens": 142469777.0, + "step": 3734 + }, + { + "epoch": 0.4751303905355553, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.988166093826294, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8480840921401978, + "num_tokens": 142510052.0, + "step": 3735 + }, + { + "epoch": 0.4752576008141458, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.1687352657318115, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8460038304328918, + "num_tokens": 142545689.0, + "step": 3736 + }, + { + "epoch": 0.4753848110927363, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9006582498550415, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.856121301651001, + "num_tokens": 142585916.0, + "step": 3737 + }, + { + "epoch": 0.4755120213713268, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9020699262619019, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.864696741104126, + "num_tokens": 142623019.0, + "step": 3738 + }, + { + "epoch": 0.47563923164991734, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8430213928222656, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.854202926158905, + "num_tokens": 142660986.0, + "step": 3739 + }, + { + "epoch": 0.4757664419285078, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.7631429433822632, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8611348867416382, + "num_tokens": 142702876.0, + "step": 3740 + }, + { + "epoch": 0.47589365220709834, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.047168493270874, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8459545373916626, + "num_tokens": 142744082.0, + "step": 3741 + }, + { + "epoch": 0.47602086248568887, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9298897981643677, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8399667143821716, + "num_tokens": 142777511.0, + "step": 3742 + }, + { + "epoch": 0.47614807276427934, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8855745792388916, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8522707223892212, + "num_tokens": 142818882.0, + "step": 3743 + }, + { + "epoch": 0.47627528304286987, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.0818536281585693, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8617730140686035, + "num_tokens": 142852771.0, + "step": 3744 + }, + { + "epoch": 0.4764024933214604, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.094369888305664, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8586844205856323, + "num_tokens": 142888398.0, + "step": 3745 + }, + { + "epoch": 0.47652970360005087, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.88063383102417, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8732467889785767, + "num_tokens": 142924789.0, + "step": 3746 + }, + { + "epoch": 0.4766569138786414, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.982008457183838, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8658872246742249, + "num_tokens": 142958843.0, + "step": 3747 + }, + { + "epoch": 0.4767841241572319, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.0358831882476807, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8686645030975342, + "num_tokens": 142996408.0, + "step": 3748 + }, + { + "epoch": 0.4769113344358224, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8243873119354248, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8497821092605591, + "num_tokens": 143040031.0, + "step": 3749 + }, + { + "epoch": 0.4770385447144129, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.4301700592041016, + "learning_rate": 1e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8345868587493896, + "num_tokens": 143081648.0, + "step": 3750 + }, + { + "epoch": 0.47716575499300345, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.881886601448059, + "learning_rate": 1e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.8361769318580627, + "num_tokens": 143118913.0, + "step": 3751 + }, + { + "epoch": 0.4772929652715939, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9702881574630737, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8662062883377075, + "num_tokens": 143153858.0, + "step": 3752 + }, + { + "epoch": 0.47742017555018446, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.313450574874878, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8686195611953735, + "num_tokens": 143190977.0, + "step": 3753 + }, + { + "epoch": 0.477547385828775, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8847172260284424, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8507659435272217, + "num_tokens": 143228009.0, + "step": 3754 + }, + { + "epoch": 0.47767459610736546, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8494044542312622, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8473693132400513, + "num_tokens": 143269820.0, + "step": 3755 + }, + { + "epoch": 0.477801806385956, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.851314067840576, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8615216016769409, + "num_tokens": 143303441.0, + "step": 3756 + }, + { + "epoch": 0.4779290166645465, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9883856773376465, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8505976796150208, + "num_tokens": 143338748.0, + "step": 3757 + }, + { + "epoch": 0.478056226943137, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9912370443344116, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8575766086578369, + "num_tokens": 143373900.0, + "step": 3758 + }, + { + "epoch": 0.4781834372217275, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.071981430053711, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8516844511032104, + "num_tokens": 143411190.0, + "step": 3759 + }, + { + "epoch": 0.47831064750031804, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8317304849624634, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8542494177818298, + "num_tokens": 143449470.0, + "step": 3760 + }, + { + "epoch": 0.4784378577789085, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.914899468421936, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8640726804733276, + "num_tokens": 143485868.0, + "step": 3761 + }, + { + "epoch": 0.47856506805749904, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.986930012702942, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8666138648986816, + "num_tokens": 143520334.0, + "step": 3762 + }, + { + "epoch": 0.47869227833608957, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.7999235391616821, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8514951467514038, + "num_tokens": 143557752.0, + "step": 3763 + }, + { + "epoch": 0.47881948861468004, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9388362169265747, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8570163249969482, + "num_tokens": 143593385.0, + "step": 3764 + }, + { + "epoch": 0.4789466988932706, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.439253330230713, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8364806175231934, + "num_tokens": 143631975.0, + "step": 3765 + }, + { + "epoch": 0.4790739091718611, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.1584951877593994, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8405106067657471, + "num_tokens": 143668382.0, + "step": 3766 + }, + { + "epoch": 0.4792011194504516, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8795499801635742, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8441970348358154, + "num_tokens": 143706623.0, + "step": 3767 + }, + { + "epoch": 0.4793283297290421, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.7766417264938354, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8624433279037476, + "num_tokens": 143745664.0, + "step": 3768 + }, + { + "epoch": 0.47945554000763263, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.020784378051758, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8481762409210205, + "num_tokens": 143780761.0, + "step": 3769 + }, + { + "epoch": 0.4795827502862231, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.9174901247024536, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8626244068145752, + "num_tokens": 143825298.0, + "step": 3770 + }, + { + "epoch": 0.47970996056481363, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.8632888793945312, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.876915454864502, + "num_tokens": 143859855.0, + "step": 3771 + }, + { + "epoch": 0.47983717084340416, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.539569139480591, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8704976439476013, + "num_tokens": 143898553.0, + "step": 3772 + }, + { + "epoch": 0.47996438112199463, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.02717661857605, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8545408844947815, + "num_tokens": 143935362.0, + "step": 3773 + }, + { + "epoch": 0.48009159140058516, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.7948864698410034, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8582248687744141, + "num_tokens": 143978692.0, + "step": 3774 + }, + { + "epoch": 0.4802188016791757, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.8045458793640137, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8645022511482239, + "num_tokens": 144020951.0, + "step": 3775 + }, + { + "epoch": 0.48034601195776616, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9104338884353638, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8665424585342407, + "num_tokens": 144058341.0, + "step": 3776 + }, + { + "epoch": 0.4804732222363567, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9471027851104736, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.850574254989624, + "num_tokens": 144098232.0, + "step": 3777 + }, + { + "epoch": 0.4806004325149472, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8005112409591675, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.852681040763855, + "num_tokens": 144137537.0, + "step": 3778 + }, + { + "epoch": 0.4807276427935377, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8999552726745605, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8637466430664062, + "num_tokens": 144178459.0, + "step": 3779 + }, + { + "epoch": 0.4808548530721282, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.913448452949524, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8521581888198853, + "num_tokens": 144219257.0, + "step": 3780 + }, + { + "epoch": 0.48098206335071875, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.804093599319458, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8631606101989746, + "num_tokens": 144262232.0, + "step": 3781 + }, + { + "epoch": 0.4811092736293093, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.1604108810424805, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.836400032043457, + "num_tokens": 144297144.0, + "step": 3782 + }, + { + "epoch": 0.48123648390789975, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.2367892265319824, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8583142757415771, + "num_tokens": 144336078.0, + "step": 3783 + }, + { + "epoch": 0.4813636941864903, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.9396792650222778, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8499894142150879, + "num_tokens": 144380162.0, + "step": 3784 + }, + { + "epoch": 0.4814909044650808, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.9616751670837402, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8446931838989258, + "num_tokens": 144416735.0, + "step": 3785 + }, + { + "epoch": 0.4816181147436713, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.0137252807617188, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8470335602760315, + "num_tokens": 144452780.0, + "step": 3786 + }, + { + "epoch": 0.4817453250222618, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.6233413219451904, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8623343706130981, + "num_tokens": 144494155.0, + "step": 3787 + }, + { + "epoch": 0.48187253530085233, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.8082408905029297, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8671551942825317, + "num_tokens": 144533989.0, + "step": 3788 + }, + { + "epoch": 0.4819997455794428, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8541697263717651, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8539734482765198, + "num_tokens": 144577409.0, + "step": 3789 + }, + { + "epoch": 0.48212695585803333, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9132426977157593, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8489559888839722, + "num_tokens": 144618501.0, + "step": 3790 + }, + { + "epoch": 0.48225416613662386, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9446440935134888, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8639692664146423, + "num_tokens": 144659454.0, + "step": 3791 + }, + { + "epoch": 0.48238137641521434, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.7919889688491821, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8638410568237305, + "num_tokens": 144695580.0, + "step": 3792 + }, + { + "epoch": 0.48250858669380486, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8563344478607178, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8399229049682617, + "num_tokens": 144735586.0, + "step": 3793 + }, + { + "epoch": 0.4826357969723954, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.6993969678878784, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8591129779815674, + "num_tokens": 144776371.0, + "step": 3794 + }, + { + "epoch": 0.48276300725098586, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.426933526992798, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8518804311752319, + "num_tokens": 144816724.0, + "step": 3795 + }, + { + "epoch": 0.4828902175295764, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8530939817428589, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8623872995376587, + "num_tokens": 144854591.0, + "step": 3796 + }, + { + "epoch": 0.4830174278081669, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8670625686645508, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.858119785785675, + "num_tokens": 144893694.0, + "step": 3797 + }, + { + "epoch": 0.4831446380867574, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8251787424087524, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8547539114952087, + "num_tokens": 144932985.0, + "step": 3798 + }, + { + "epoch": 0.4832718483653479, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9049562215805054, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8586876392364502, + "num_tokens": 144967823.0, + "step": 3799 + }, + { + "epoch": 0.48339905864393845, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.7762092351913452, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8779277205467224, + "num_tokens": 145007750.0, + "step": 3800 + }, + { + "epoch": 0.4835262689225289, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.6623103618621826, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8516947031021118, + "num_tokens": 145048596.0, + "step": 3801 + }, + { + "epoch": 0.48365347920111945, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.0457732677459717, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.847611665725708, + "num_tokens": 145080868.0, + "step": 3802 + }, + { + "epoch": 0.48378068947971, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.9526783227920532, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8654521703720093, + "num_tokens": 145115865.0, + "step": 3803 + }, + { + "epoch": 0.48390789975830045, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.7104941606521606, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.870410144329071, + "num_tokens": 145151748.0, + "step": 3804 + }, + { + "epoch": 0.484035110036891, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.816938042640686, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8561484813690186, + "num_tokens": 145187074.0, + "step": 3805 + }, + { + "epoch": 0.4841623203154815, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.8320261240005493, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8767578601837158, + "num_tokens": 145224855.0, + "step": 3806 + }, + { + "epoch": 0.484289530594072, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.7396154403686523, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8661285638809204, + "num_tokens": 145266538.0, + "step": 3807 + }, + { + "epoch": 0.4844167408726625, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.9102485179901123, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8638738393783569, + "num_tokens": 145303560.0, + "step": 3808 + }, + { + "epoch": 0.48454395115125304, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.842256784439087, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8588228225708008, + "num_tokens": 145345985.0, + "step": 3809 + }, + { + "epoch": 0.4846711614298435, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.3669586181640625, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.846619725227356, + "num_tokens": 145382132.0, + "step": 3810 + }, + { + "epoch": 0.48479837170843404, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.9890121221542358, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8483251333236694, + "num_tokens": 145418992.0, + "step": 3811 + }, + { + "epoch": 0.48492558198702457, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.0207784175872803, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.856669008731842, + "num_tokens": 145455892.0, + "step": 3812 + }, + { + "epoch": 0.48505279226561504, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.237985610961914, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8562896251678467, + "num_tokens": 145497217.0, + "step": 3813 + }, + { + "epoch": 0.48518000254420557, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.8075511455535889, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8643777370452881, + "num_tokens": 145533822.0, + "step": 3814 + }, + { + "epoch": 0.4853072128227961, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.839296579360962, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8772568702697754, + "num_tokens": 145572967.0, + "step": 3815 + }, + { + "epoch": 0.48543442310138657, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.7935733795166016, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8724987506866455, + "num_tokens": 145609436.0, + "step": 3816 + }, + { + "epoch": 0.4855616333799771, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.9856138229370117, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8693602085113525, + "num_tokens": 145645083.0, + "step": 3817 + }, + { + "epoch": 0.4856888436585676, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.9922438859939575, + "learning_rate": 1e-06, + "loss": 0.531, + "mean_token_accuracy": 0.8397294282913208, + "num_tokens": 145680249.0, + "step": 3818 + }, + { + "epoch": 0.4858160539371581, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.880053997039795, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8626392483711243, + "num_tokens": 145718569.0, + "step": 3819 + }, + { + "epoch": 0.4859432642157486, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.1190991401672363, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8417723178863525, + "num_tokens": 145751868.0, + "step": 3820 + }, + { + "epoch": 0.48607047449433916, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.9391403198242188, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8616059422492981, + "num_tokens": 145787881.0, + "step": 3821 + }, + { + "epoch": 0.48619768477292963, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.881800651550293, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8643932342529297, + "num_tokens": 145824010.0, + "step": 3822 + }, + { + "epoch": 0.48632489505152016, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.9259958267211914, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8663206696510315, + "num_tokens": 145860981.0, + "step": 3823 + }, + { + "epoch": 0.4864521053301107, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.9347273111343384, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.862360954284668, + "num_tokens": 145900687.0, + "step": 3824 + }, + { + "epoch": 0.48657931560870116, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.4459609985351562, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8491758108139038, + "num_tokens": 145938888.0, + "step": 3825 + }, + { + "epoch": 0.4867065258872917, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.9562432765960693, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8539403080940247, + "num_tokens": 145987060.0, + "step": 3826 + }, + { + "epoch": 0.4868337361658822, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.954918622970581, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8525996208190918, + "num_tokens": 146026447.0, + "step": 3827 + }, + { + "epoch": 0.4869609464444727, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.764723777770996, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8605136871337891, + "num_tokens": 146067099.0, + "step": 3828 + }, + { + "epoch": 0.4870881567230632, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.725003719329834, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8455575108528137, + "num_tokens": 146110767.0, + "step": 3829 + }, + { + "epoch": 0.48721536700165374, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.818585753440857, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8499233722686768, + "num_tokens": 146147603.0, + "step": 3830 + }, + { + "epoch": 0.48734257728024427, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.822594165802002, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8685781359672546, + "num_tokens": 146188821.0, + "step": 3831 + }, + { + "epoch": 0.48746978755883474, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.8421621322631836, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8542920351028442, + "num_tokens": 146231886.0, + "step": 3832 + }, + { + "epoch": 0.48759699783742527, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.820075511932373, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8346477150917053, + "num_tokens": 146271965.0, + "step": 3833 + }, + { + "epoch": 0.4877242081160158, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.912845492362976, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8447645306587219, + "num_tokens": 146316686.0, + "step": 3834 + }, + { + "epoch": 0.4878514183946063, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.924698829650879, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.847846508026123, + "num_tokens": 146355098.0, + "step": 3835 + }, + { + "epoch": 0.4879786286731968, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.9711670875549316, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.863139271736145, + "num_tokens": 146390666.0, + "step": 3836 + }, + { + "epoch": 0.48810583895178733, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.9162169694900513, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8654444217681885, + "num_tokens": 146426740.0, + "step": 3837 + }, + { + "epoch": 0.4882330492303778, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.132164239883423, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8484327793121338, + "num_tokens": 146462668.0, + "step": 3838 + }, + { + "epoch": 0.48836025950896833, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.984958291053772, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8509029150009155, + "num_tokens": 146500160.0, + "step": 3839 + }, + { + "epoch": 0.48848746978755886, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.8506280183792114, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8530752658843994, + "num_tokens": 146542963.0, + "step": 3840 + }, + { + "epoch": 0.48861468006614933, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.8529655933380127, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.863782525062561, + "num_tokens": 146580649.0, + "step": 3841 + }, + { + "epoch": 0.48874189034473986, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.1800670623779297, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8641790151596069, + "num_tokens": 146620823.0, + "step": 3842 + }, + { + "epoch": 0.4888691006233304, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.7506338357925415, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8683274984359741, + "num_tokens": 146663700.0, + "step": 3843 + }, + { + "epoch": 0.48899631090192086, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.8087159395217896, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8525136709213257, + "num_tokens": 146698211.0, + "step": 3844 + }, + { + "epoch": 0.4891235211805114, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8790743350982666, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8563541173934937, + "num_tokens": 146738380.0, + "step": 3845 + }, + { + "epoch": 0.4892507314591019, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8305778503417969, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8603053092956543, + "num_tokens": 146775413.0, + "step": 3846 + }, + { + "epoch": 0.4893779417376924, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.8129364252090454, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.860733151435852, + "num_tokens": 146814774.0, + "step": 3847 + }, + { + "epoch": 0.4895051520162829, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.787811517715454, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.868661642074585, + "num_tokens": 146853957.0, + "step": 3848 + }, + { + "epoch": 0.48963236229487345, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.0388455390930176, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8498235940933228, + "num_tokens": 146889060.0, + "step": 3849 + }, + { + "epoch": 0.4897595725734639, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.876057744026184, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.868180513381958, + "num_tokens": 146929849.0, + "step": 3850 + }, + { + "epoch": 0.48988678285205445, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.8589271306991577, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8588669300079346, + "num_tokens": 146968376.0, + "step": 3851 + }, + { + "epoch": 0.490013993130645, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.1515891551971436, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8420165777206421, + "num_tokens": 147002871.0, + "step": 3852 + }, + { + "epoch": 0.49014120340923545, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.0167338848114014, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8531377911567688, + "num_tokens": 147039412.0, + "step": 3853 + }, + { + "epoch": 0.490268413687826, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.8328124284744263, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8611550331115723, + "num_tokens": 147080307.0, + "step": 3854 + }, + { + "epoch": 0.4903956239664165, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.8674834966659546, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.864439845085144, + "num_tokens": 147120131.0, + "step": 3855 + }, + { + "epoch": 0.490522834245007, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.2373130321502686, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8509430289268494, + "num_tokens": 147155760.0, + "step": 3856 + }, + { + "epoch": 0.4906500445235975, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.832044005393982, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8629089593887329, + "num_tokens": 147195406.0, + "step": 3857 + }, + { + "epoch": 0.49077725480218803, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.9947404861450195, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8709661960601807, + "num_tokens": 147235688.0, + "step": 3858 + }, + { + "epoch": 0.4909044650807785, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.8667007684707642, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8661587238311768, + "num_tokens": 147271575.0, + "step": 3859 + }, + { + "epoch": 0.49103167535936904, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.7606401443481445, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.854897141456604, + "num_tokens": 147315473.0, + "step": 3860 + }, + { + "epoch": 0.49115888563795956, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.6873254776000977, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8625342845916748, + "num_tokens": 147359058.0, + "step": 3861 + }, + { + "epoch": 0.49128609591655004, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.1411001682281494, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8629622459411621, + "num_tokens": 147394443.0, + "step": 3862 + }, + { + "epoch": 0.49141330619514056, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.8603914976119995, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.853531002998352, + "num_tokens": 147436225.0, + "step": 3863 + }, + { + "epoch": 0.4915405164737311, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.8606947660446167, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8687673211097717, + "num_tokens": 147473449.0, + "step": 3864 + }, + { + "epoch": 0.49166772675232157, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.837444543838501, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8558703660964966, + "num_tokens": 147510330.0, + "step": 3865 + }, + { + "epoch": 0.4917949370309121, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.893128514289856, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8696763515472412, + "num_tokens": 147544683.0, + "step": 3866 + }, + { + "epoch": 0.4919221473095026, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.9017512798309326, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.876186728477478, + "num_tokens": 147589330.0, + "step": 3867 + }, + { + "epoch": 0.4920493575880931, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.943965196609497, + "learning_rate": 1e-06, + "loss": 0.5547, + "mean_token_accuracy": 0.8278043866157532, + "num_tokens": 147626864.0, + "step": 3868 + }, + { + "epoch": 0.4921765678666836, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.7251033782958984, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8409143090248108, + "num_tokens": 147672006.0, + "step": 3869 + }, + { + "epoch": 0.49230377814527415, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.6178032159805298, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8492193222045898, + "num_tokens": 147717388.0, + "step": 3870 + }, + { + "epoch": 0.4924309884238646, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.8943326473236084, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8465621471405029, + "num_tokens": 147759827.0, + "step": 3871 + }, + { + "epoch": 0.49255819870245515, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.1019997596740723, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8390889167785645, + "num_tokens": 147793742.0, + "step": 3872 + }, + { + "epoch": 0.4926854089810457, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.9483789205551147, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8683222532272339, + "num_tokens": 147829881.0, + "step": 3873 + }, + { + "epoch": 0.49281261925963615, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.7921706438064575, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8428686857223511, + "num_tokens": 147875051.0, + "step": 3874 + }, + { + "epoch": 0.4929398295382267, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.8762166500091553, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8503010272979736, + "num_tokens": 147912786.0, + "step": 3875 + }, + { + "epoch": 0.4930670398168172, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.768601655960083, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8366830945014954, + "num_tokens": 147954981.0, + "step": 3876 + }, + { + "epoch": 0.4931942500954077, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.9293125867843628, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8502664566040039, + "num_tokens": 147992179.0, + "step": 3877 + }, + { + "epoch": 0.4933214603739982, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.8848352432250977, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8623522520065308, + "num_tokens": 148029866.0, + "step": 3878 + }, + { + "epoch": 0.49344867065258874, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.8855232000350952, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8577232360839844, + "num_tokens": 148070057.0, + "step": 3879 + }, + { + "epoch": 0.4935758809311792, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.9091695547103882, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8625441193580627, + "num_tokens": 148108461.0, + "step": 3880 + }, + { + "epoch": 0.49370309120976974, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.853455662727356, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8516839146614075, + "num_tokens": 148147609.0, + "step": 3881 + }, + { + "epoch": 0.49383030148836027, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.7668408155441284, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.852620542049408, + "num_tokens": 148190475.0, + "step": 3882 + }, + { + "epoch": 0.4939575117669508, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.03155517578125, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8699030876159668, + "num_tokens": 148221187.0, + "step": 3883 + }, + { + "epoch": 0.49408472204554127, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.730684518814087, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8662963509559631, + "num_tokens": 148260916.0, + "step": 3884 + }, + { + "epoch": 0.4942119323241318, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 1.929001808166504, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8638942241668701, + "num_tokens": 148295591.0, + "step": 3885 + }, + { + "epoch": 0.4943391426027223, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.9306917190551758, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8526654243469238, + "num_tokens": 148333445.0, + "step": 3886 + }, + { + "epoch": 0.4944663528813128, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.9762427806854248, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8664118051528931, + "num_tokens": 148367951.0, + "step": 3887 + }, + { + "epoch": 0.4945935631599033, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.8962080478668213, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.850245475769043, + "num_tokens": 148408491.0, + "step": 3888 + }, + { + "epoch": 0.49472077343849385, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.4578230381011963, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8576928377151489, + "num_tokens": 148443883.0, + "step": 3889 + }, + { + "epoch": 0.4948479837170843, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.5554747581481934, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8600835800170898, + "num_tokens": 148480511.0, + "step": 3890 + }, + { + "epoch": 0.49497519399567486, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.024381160736084, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8521808981895447, + "num_tokens": 148514759.0, + "step": 3891 + }, + { + "epoch": 0.4951024042742654, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.7221423387527466, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8598856925964355, + "num_tokens": 148553209.0, + "step": 3892 + }, + { + "epoch": 0.49522961455285586, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.898057460784912, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8490614295005798, + "num_tokens": 148590387.0, + "step": 3893 + }, + { + "epoch": 0.4953568248314464, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.015218734741211, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8559281826019287, + "num_tokens": 148625201.0, + "step": 3894 + }, + { + "epoch": 0.4954840351100369, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.8906903266906738, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8606006503105164, + "num_tokens": 148665989.0, + "step": 3895 + }, + { + "epoch": 0.4956112453886274, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.893378496170044, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.861707329750061, + "num_tokens": 148704839.0, + "step": 3896 + }, + { + "epoch": 0.4957384556672179, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.8350344896316528, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8680242300033569, + "num_tokens": 148740523.0, + "step": 3897 + }, + { + "epoch": 0.49586566594580844, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.806946039199829, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8726376295089722, + "num_tokens": 148779103.0, + "step": 3898 + }, + { + "epoch": 0.4959928762243989, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.8086578845977783, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8691353797912598, + "num_tokens": 148814659.0, + "step": 3899 + }, + { + "epoch": 0.49612008650298944, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 1.8390032052993774, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8789284229278564, + "num_tokens": 148856864.0, + "step": 3900 + }, + { + "epoch": 0.49624729678157997, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.959425687789917, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.864254891872406, + "num_tokens": 148890309.0, + "step": 3901 + }, + { + "epoch": 0.49637450706017044, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.9687308073043823, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8442590236663818, + "num_tokens": 148926662.0, + "step": 3902 + }, + { + "epoch": 0.496501717338761, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.003995180130005, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8457462787628174, + "num_tokens": 148963032.0, + "step": 3903 + }, + { + "epoch": 0.4966289276173515, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.057860851287842, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8643240332603455, + "num_tokens": 148996684.0, + "step": 3904 + }, + { + "epoch": 0.496756137895942, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.8443690538406372, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8570883870124817, + "num_tokens": 149032735.0, + "step": 3905 + }, + { + "epoch": 0.4968833481745325, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.1907732486724854, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8593658208847046, + "num_tokens": 149069415.0, + "step": 3906 + }, + { + "epoch": 0.49701055845312303, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8352028131484985, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8444129228591919, + "num_tokens": 149115042.0, + "step": 3907 + }, + { + "epoch": 0.4971377687317135, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.312242031097412, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8666776418685913, + "num_tokens": 149149767.0, + "step": 3908 + }, + { + "epoch": 0.49726497901030403, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8758342266082764, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8515939712524414, + "num_tokens": 149193000.0, + "step": 3909 + }, + { + "epoch": 0.49739218928889456, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.9450000524520874, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.854515016078949, + "num_tokens": 149226963.0, + "step": 3910 + }, + { + "epoch": 0.49751939956748503, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.7946093082427979, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8532146215438843, + "num_tokens": 149268139.0, + "step": 3911 + }, + { + "epoch": 0.49764660984607556, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.9594225883483887, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.865405797958374, + "num_tokens": 149305882.0, + "step": 3912 + }, + { + "epoch": 0.4977738201246661, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8538819551467896, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8543609380722046, + "num_tokens": 149344119.0, + "step": 3913 + }, + { + "epoch": 0.49790103040325656, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.9024569988250732, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.84870845079422, + "num_tokens": 149384102.0, + "step": 3914 + }, + { + "epoch": 0.4980282406818471, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.9549803733825684, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8553709983825684, + "num_tokens": 149425677.0, + "step": 3915 + }, + { + "epoch": 0.4981554509604376, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 2.0427191257476807, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8522838354110718, + "num_tokens": 149463275.0, + "step": 3916 + }, + { + "epoch": 0.4982826612390281, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 1.7717983722686768, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8734052181243896, + "num_tokens": 149503950.0, + "step": 3917 + }, + { + "epoch": 0.4984098715176186, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8578462600708008, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8495006561279297, + "num_tokens": 149541192.0, + "step": 3918 + }, + { + "epoch": 0.49853708179620915, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 1.8839865922927856, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.856694221496582, + "num_tokens": 149581784.0, + "step": 3919 + }, + { + "epoch": 0.4986642920747996, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 1.8753365278244019, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8623086214065552, + "num_tokens": 149619312.0, + "step": 3920 + }, + { + "epoch": 0.49879150235339015, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 1.9994843006134033, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8548049926757812, + "num_tokens": 149657443.0, + "step": 3921 + }, + { + "epoch": 0.4989187126319807, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 1.8601937294006348, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8701587915420532, + "num_tokens": 149694992.0, + "step": 3922 + }, + { + "epoch": 0.49904592291057115, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.8872886896133423, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8539485931396484, + "num_tokens": 149729554.0, + "step": 3923 + }, + { + "epoch": 0.4991731331891617, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 1.8557485342025757, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.864234209060669, + "num_tokens": 149770051.0, + "step": 3924 + }, + { + "epoch": 0.4993003434677522, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 2.1748099327087402, + "learning_rate": 1e-06, + "loss": 0.5509, + "mean_token_accuracy": 0.8273441791534424, + "num_tokens": 149806724.0, + "step": 3925 + }, + { + "epoch": 0.4994275537463427, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 16.600183486938477, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8495793342590332, + "num_tokens": 149849874.0, + "step": 3926 + }, + { + "epoch": 0.4995547640249332, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.9609739780426025, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8564325571060181, + "num_tokens": 149888076.0, + "step": 3927 + }, + { + "epoch": 0.49968197430352373, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.9438644647598267, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8580028414726257, + "num_tokens": 149923750.0, + "step": 3928 + }, + { + "epoch": 0.4998091845821142, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.0844898223876953, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8456969261169434, + "num_tokens": 149960657.0, + "step": 3929 + }, + { + "epoch": 0.49993639486070474, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 1.956516146659851, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8728650808334351, + "num_tokens": 149995983.0, + "step": 3930 + }, + { + "epoch": 0.5000636051392953, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 1.7759603261947632, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8370187878608704, + "num_tokens": 150037508.0, + "step": 3931 + }, + { + "epoch": 0.5001908154178858, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 1.9244931936264038, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8528420925140381, + "num_tokens": 150077806.0, + "step": 3932 + }, + { + "epoch": 0.5003180256964763, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 1.93488347530365, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.855640172958374, + "num_tokens": 150117589.0, + "step": 3933 + }, + { + "epoch": 0.5004452359750667, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.8502248525619507, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8511505722999573, + "num_tokens": 150161579.0, + "step": 3934 + }, + { + "epoch": 0.5005724462536573, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.252899408340454, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8442674875259399, + "num_tokens": 150208018.0, + "step": 3935 + }, + { + "epoch": 0.5006996565322478, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.835286021232605, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8584315776824951, + "num_tokens": 150250164.0, + "step": 3936 + }, + { + "epoch": 0.5008268668108383, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 4.174783706665039, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8768160343170166, + "num_tokens": 150289972.0, + "step": 3937 + }, + { + "epoch": 0.5009540770894289, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 1.9135842323303223, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8554952144622803, + "num_tokens": 150333656.0, + "step": 3938 + }, + { + "epoch": 0.5010812873680194, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 1.7173353433609009, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8710970878601074, + "num_tokens": 150374086.0, + "step": 3939 + }, + { + "epoch": 0.5012084976466098, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.805552363395691, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8613630533218384, + "num_tokens": 150407472.0, + "step": 3940 + }, + { + "epoch": 0.5013357079252003, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.8107165098190308, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8635236024856567, + "num_tokens": 150444560.0, + "step": 3941 + }, + { + "epoch": 0.5014629182037909, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.8814078569412231, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8628024458885193, + "num_tokens": 150483621.0, + "step": 3942 + }, + { + "epoch": 0.5015901284823814, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 1.8273292779922485, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8577154874801636, + "num_tokens": 150520414.0, + "step": 3943 + }, + { + "epoch": 0.5017173387609719, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 16.606626510620117, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8494061231613159, + "num_tokens": 150556589.0, + "step": 3944 + }, + { + "epoch": 0.5018445490395624, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 2.0000178813934326, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8553783297538757, + "num_tokens": 150592393.0, + "step": 3945 + }, + { + "epoch": 0.5019717593181529, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 2.0495309829711914, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.843934178352356, + "num_tokens": 150630457.0, + "step": 3946 + }, + { + "epoch": 0.5020989695967434, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 2.1176934242248535, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.865237832069397, + "num_tokens": 150659482.0, + "step": 3947 + }, + { + "epoch": 0.5022261798753339, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.9478154182434082, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8569084405899048, + "num_tokens": 150700920.0, + "step": 3948 + }, + { + "epoch": 0.5023533901539244, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.7659475803375244, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8683662414550781, + "num_tokens": 150736393.0, + "step": 3949 + }, + { + "epoch": 0.502480600432515, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.7458906173706055, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8536174297332764, + "num_tokens": 150783848.0, + "step": 3950 + }, + { + "epoch": 0.5026078107111055, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.7859301567077637, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8547928333282471, + "num_tokens": 150824179.0, + "step": 3951 + }, + { + "epoch": 0.5027350209896959, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.8704469203948975, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8583850860595703, + "num_tokens": 150862775.0, + "step": 3952 + }, + { + "epoch": 0.5028622312682864, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.9213210344314575, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8572714328765869, + "num_tokens": 150900005.0, + "step": 3953 + }, + { + "epoch": 0.502989441546877, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.881050705909729, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8545103073120117, + "num_tokens": 150936148.0, + "step": 3954 + }, + { + "epoch": 0.5031166518254675, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.8864850997924805, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8550641536712646, + "num_tokens": 150971661.0, + "step": 3955 + }, + { + "epoch": 0.503243862104058, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.2329530715942383, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8681536316871643, + "num_tokens": 151009521.0, + "step": 3956 + }, + { + "epoch": 0.5033710723826486, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.9023762941360474, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8541213274002075, + "num_tokens": 151053641.0, + "step": 3957 + }, + { + "epoch": 0.5034982826612391, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.8151499032974243, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8619619607925415, + "num_tokens": 151094476.0, + "step": 3958 + }, + { + "epoch": 0.5036254929398295, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.9709668159484863, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8513245582580566, + "num_tokens": 151126381.0, + "step": 3959 + }, + { + "epoch": 0.50375270321842, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.092921495437622, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8623227477073669, + "num_tokens": 151156875.0, + "step": 3960 + }, + { + "epoch": 0.5038799134970106, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.8554797172546387, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8538395166397095, + "num_tokens": 151194394.0, + "step": 3961 + }, + { + "epoch": 0.5040071237756011, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.8158900737762451, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8442112803459167, + "num_tokens": 151234472.0, + "step": 3962 + }, + { + "epoch": 0.5041343340541916, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.1543116569519043, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8526960015296936, + "num_tokens": 151270426.0, + "step": 3963 + }, + { + "epoch": 0.5042615443327821, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 2.133725643157959, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8563134670257568, + "num_tokens": 151305793.0, + "step": 3964 + }, + { + "epoch": 0.5043887546113726, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.9508503675460815, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8453688621520996, + "num_tokens": 151345116.0, + "step": 3965 + }, + { + "epoch": 0.5045159648899631, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8529064655303955, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.861992359161377, + "num_tokens": 151381901.0, + "step": 3966 + }, + { + "epoch": 0.5046431751685536, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.7356617450714111, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8595074415206909, + "num_tokens": 151426884.0, + "step": 3967 + }, + { + "epoch": 0.5047703854471441, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.9277191162109375, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8504688143730164, + "num_tokens": 151467596.0, + "step": 3968 + }, + { + "epoch": 0.5048975957257347, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8376493453979492, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8767631649971008, + "num_tokens": 151502131.0, + "step": 3969 + }, + { + "epoch": 0.5050248060043252, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.062758207321167, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8532899618148804, + "num_tokens": 151538932.0, + "step": 3970 + }, + { + "epoch": 0.5051520162829156, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.9546890258789062, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8566845655441284, + "num_tokens": 151577839.0, + "step": 3971 + }, + { + "epoch": 0.5052792265615061, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.791015386581421, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.857036828994751, + "num_tokens": 151618984.0, + "step": 3972 + }, + { + "epoch": 0.5054064368400967, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8677603006362915, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8575623035430908, + "num_tokens": 151658392.0, + "step": 3973 + }, + { + "epoch": 0.5055336471186872, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.9595496654510498, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8681277632713318, + "num_tokens": 151689781.0, + "step": 3974 + }, + { + "epoch": 0.5056608573972777, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.2002511024475098, + "learning_rate": 1e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8437853455543518, + "num_tokens": 151729612.0, + "step": 3975 + }, + { + "epoch": 0.5057880676758683, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8872473239898682, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8447985649108887, + "num_tokens": 151769270.0, + "step": 3976 + }, + { + "epoch": 0.5059152779544587, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.1314878463745117, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.839693546295166, + "num_tokens": 151803936.0, + "step": 3977 + }, + { + "epoch": 0.5060424882330492, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8876879215240479, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8452106714248657, + "num_tokens": 151840429.0, + "step": 3978 + }, + { + "epoch": 0.5061696985116397, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.7661256790161133, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8590532541275024, + "num_tokens": 151883296.0, + "step": 3979 + }, + { + "epoch": 0.5062969087902303, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.736151099205017, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8726720809936523, + "num_tokens": 151917658.0, + "step": 3980 + }, + { + "epoch": 0.5064241190688208, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.7830262184143066, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8570375442504883, + "num_tokens": 151958822.0, + "step": 3981 + }, + { + "epoch": 0.5065513293474113, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8740803003311157, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8525200486183167, + "num_tokens": 151994964.0, + "step": 3982 + }, + { + "epoch": 0.5066785396260017, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.7816234827041626, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8520652055740356, + "num_tokens": 152037013.0, + "step": 3983 + }, + { + "epoch": 0.5068057499045923, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8963240385055542, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8539527654647827, + "num_tokens": 152074821.0, + "step": 3984 + }, + { + "epoch": 0.5069329601831828, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.7559317350387573, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8487988710403442, + "num_tokens": 152116667.0, + "step": 3985 + }, + { + "epoch": 0.5070601704617733, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8501179218292236, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8614997863769531, + "num_tokens": 152152714.0, + "step": 3986 + }, + { + "epoch": 0.5071873807403638, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8465046882629395, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8439629673957825, + "num_tokens": 152190230.0, + "step": 3987 + }, + { + "epoch": 0.5073145910189544, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.847549319267273, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8564375638961792, + "num_tokens": 152228887.0, + "step": 3988 + }, + { + "epoch": 0.5074418012975448, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.209555149078369, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8659837245941162, + "num_tokens": 152271863.0, + "step": 3989 + }, + { + "epoch": 0.5075690115761353, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.9077818393707275, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8530137538909912, + "num_tokens": 152310765.0, + "step": 3990 + }, + { + "epoch": 0.5076962218547258, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.8868358135223389, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8796757459640503, + "num_tokens": 152342545.0, + "step": 3991 + }, + { + "epoch": 0.5078234321333164, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.12196946144104, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8402848839759827, + "num_tokens": 152386223.0, + "step": 3992 + }, + { + "epoch": 0.5079506424119069, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.928916096687317, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8627270460128784, + "num_tokens": 152421811.0, + "step": 3993 + }, + { + "epoch": 0.5080778526904974, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 2.0740344524383545, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8454341888427734, + "num_tokens": 152454581.0, + "step": 3994 + }, + { + "epoch": 0.5082050629690879, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 2.380117177963257, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8475885987281799, + "num_tokens": 152490591.0, + "step": 3995 + }, + { + "epoch": 0.5083322732476784, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.846987009048462, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8799789547920227, + "num_tokens": 152532290.0, + "step": 3996 + }, + { + "epoch": 0.5084594835262689, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.7746003866195679, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.862007737159729, + "num_tokens": 152571984.0, + "step": 3997 + }, + { + "epoch": 0.5085866938048594, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.7954484224319458, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8577197790145874, + "num_tokens": 152611082.0, + "step": 3998 + }, + { + "epoch": 0.50871390408345, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.7032322883605957, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8650769591331482, + "num_tokens": 152655303.0, + "step": 3999 + }, + { + "epoch": 0.5088411143620405, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.111525297164917, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8570045232772827, + "num_tokens": 152688432.0, + "step": 4000 + }, + { + "epoch": 0.5089683246406309, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.1060707569122314, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8523073196411133, + "num_tokens": 152725637.0, + "step": 4001 + }, + { + "epoch": 0.5090955349192214, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.612823009490967, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8559483289718628, + "num_tokens": 152763108.0, + "step": 4002 + }, + { + "epoch": 0.509222745197812, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.828024983406067, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8638752698898315, + "num_tokens": 152800659.0, + "step": 4003 + }, + { + "epoch": 0.5093499554764025, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.8087552785873413, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8471881151199341, + "num_tokens": 152842797.0, + "step": 4004 + }, + { + "epoch": 0.509477165754993, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 3.047391176223755, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8490902781486511, + "num_tokens": 152888257.0, + "step": 4005 + }, + { + "epoch": 0.5096043760335836, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.9443771839141846, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.854905366897583, + "num_tokens": 152929089.0, + "step": 4006 + }, + { + "epoch": 0.5097315863121741, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.8330740928649902, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8515114784240723, + "num_tokens": 152972710.0, + "step": 4007 + }, + { + "epoch": 0.5098587965907645, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.876806378364563, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8578216433525085, + "num_tokens": 153008867.0, + "step": 4008 + }, + { + "epoch": 0.509986006869355, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.8795342445373535, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8515198826789856, + "num_tokens": 153051497.0, + "step": 4009 + }, + { + "epoch": 0.5101132171479456, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8493971824645996, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8722798824310303, + "num_tokens": 153089295.0, + "step": 4010 + }, + { + "epoch": 0.5102404274265361, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.528683662414551, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8624109029769897, + "num_tokens": 153123957.0, + "step": 4011 + }, + { + "epoch": 0.5103676377051266, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.8615378141403198, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8543277382850647, + "num_tokens": 153165248.0, + "step": 4012 + }, + { + "epoch": 0.5104948479837171, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8683149814605713, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8715634346008301, + "num_tokens": 153206163.0, + "step": 4013 + }, + { + "epoch": 0.5106220582623076, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.7855689525604248, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8489335775375366, + "num_tokens": 153248666.0, + "step": 4014 + }, + { + "epoch": 0.5107492685408981, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8080226182937622, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8799868822097778, + "num_tokens": 153289009.0, + "step": 4015 + }, + { + "epoch": 0.5108764788194886, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 1.9237676858901978, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8619126081466675, + "num_tokens": 153321354.0, + "step": 4016 + }, + { + "epoch": 0.5110036890980791, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.9112118482589722, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8572254776954651, + "num_tokens": 153361061.0, + "step": 4017 + }, + { + "epoch": 0.5111308993766697, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 1.8988091945648193, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8509209156036377, + "num_tokens": 153397002.0, + "step": 4018 + }, + { + "epoch": 0.5112581096552602, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.855594277381897, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8441562056541443, + "num_tokens": 153435264.0, + "step": 4019 + }, + { + "epoch": 0.5113853199338506, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 1.8939591646194458, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8561564683914185, + "num_tokens": 153474751.0, + "step": 4020 + }, + { + "epoch": 0.5115125302124411, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.9346011877059937, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8661803007125854, + "num_tokens": 153510084.0, + "step": 4021 + }, + { + "epoch": 0.5116397404910317, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.8286885023117065, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8437390327453613, + "num_tokens": 153550387.0, + "step": 4022 + }, + { + "epoch": 0.5117669507696222, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.7745450735092163, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8788363933563232, + "num_tokens": 153592441.0, + "step": 4023 + }, + { + "epoch": 0.5118941610482127, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.308570623397827, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8631832599639893, + "num_tokens": 153629202.0, + "step": 4024 + }, + { + "epoch": 0.5120213713268033, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.9171288013458252, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.851516604423523, + "num_tokens": 153669243.0, + "step": 4025 + }, + { + "epoch": 0.5121485816053937, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.844112515449524, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8526320457458496, + "num_tokens": 153709206.0, + "step": 4026 + }, + { + "epoch": 0.5122757918839842, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.1390631198883057, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8649214506149292, + "num_tokens": 153741251.0, + "step": 4027 + }, + { + "epoch": 0.5124030021625747, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.0205841064453125, + "learning_rate": 1e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8438631296157837, + "num_tokens": 153775559.0, + "step": 4028 + }, + { + "epoch": 0.5125302124411653, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.8039584159851074, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8615444898605347, + "num_tokens": 153819158.0, + "step": 4029 + }, + { + "epoch": 0.5126574227197558, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.9047435522079468, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8519015312194824, + "num_tokens": 153856231.0, + "step": 4030 + }, + { + "epoch": 0.5127846329983463, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.777226209640503, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8675262928009033, + "num_tokens": 153897125.0, + "step": 4031 + }, + { + "epoch": 0.5129118432769367, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.897510290145874, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.865105152130127, + "num_tokens": 153933552.0, + "step": 4032 + }, + { + "epoch": 0.5130390535555273, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.8515750169754028, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8430173993110657, + "num_tokens": 153974337.0, + "step": 4033 + }, + { + "epoch": 0.5131662638341178, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.9308449029922485, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8514723777770996, + "num_tokens": 154018911.0, + "step": 4034 + }, + { + "epoch": 0.5132934741127083, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.8592745065689087, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8673845529556274, + "num_tokens": 154056441.0, + "step": 4035 + }, + { + "epoch": 0.5134206843912988, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.8225340843200684, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8474526405334473, + "num_tokens": 154096326.0, + "step": 4036 + }, + { + "epoch": 0.5135478946698894, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.7597485780715942, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.861473798751831, + "num_tokens": 154137223.0, + "step": 4037 + }, + { + "epoch": 0.5136751049484798, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.124014139175415, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8556686639785767, + "num_tokens": 154168312.0, + "step": 4038 + }, + { + "epoch": 0.5138023152270703, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.8087917566299438, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.870037317276001, + "num_tokens": 154203125.0, + "step": 4039 + }, + { + "epoch": 0.5139295255056608, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.953926682472229, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8504223227500916, + "num_tokens": 154248437.0, + "step": 4040 + }, + { + "epoch": 0.5140567357842514, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.8458280563354492, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.861400842666626, + "num_tokens": 154286303.0, + "step": 4041 + }, + { + "epoch": 0.5141839460628419, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.8456226587295532, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8483966588973999, + "num_tokens": 154328638.0, + "step": 4042 + }, + { + "epoch": 0.5143111563414324, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.021028995513916, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8560971021652222, + "num_tokens": 154362953.0, + "step": 4043 + }, + { + "epoch": 0.5144383666200228, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.9182238578796387, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8454432487487793, + "num_tokens": 154401289.0, + "step": 4044 + }, + { + "epoch": 0.5145655768986134, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.821007251739502, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8616725206375122, + "num_tokens": 154438779.0, + "step": 4045 + }, + { + "epoch": 0.5146927871772039, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.9983654022216797, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8514734506607056, + "num_tokens": 154467032.0, + "step": 4046 + }, + { + "epoch": 0.5148199974557944, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.7630115747451782, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8665156364440918, + "num_tokens": 154505222.0, + "step": 4047 + }, + { + "epoch": 0.514947207734385, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.059546709060669, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8444150686264038, + "num_tokens": 154544953.0, + "step": 4048 + }, + { + "epoch": 0.5150744180129755, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.7890714406967163, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.868928074836731, + "num_tokens": 154586193.0, + "step": 4049 + }, + { + "epoch": 0.5152016282915659, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.8411533832550049, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8607707023620605, + "num_tokens": 154629531.0, + "step": 4050 + }, + { + "epoch": 0.5153288385701564, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.0028672218322754, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8730738162994385, + "num_tokens": 154659482.0, + "step": 4051 + }, + { + "epoch": 0.515456048848747, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.7811307907104492, + "learning_rate": 1e-06, + "loss": 0.5506, + "mean_token_accuracy": 0.8238608837127686, + "num_tokens": 154702645.0, + "step": 4052 + }, + { + "epoch": 0.5155832591273375, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.7483197450637817, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8665865659713745, + "num_tokens": 154743329.0, + "step": 4053 + }, + { + "epoch": 0.515710469405928, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.8836476802825928, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8538192510604858, + "num_tokens": 154783460.0, + "step": 4054 + }, + { + "epoch": 0.5158376796845185, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.883070707321167, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8514405488967896, + "num_tokens": 154827209.0, + "step": 4055 + }, + { + "epoch": 0.5159648899631091, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.9142178297042847, + "learning_rate": 1e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8330036401748657, + "num_tokens": 154866577.0, + "step": 4056 + }, + { + "epoch": 0.5160921002416995, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.7214802503585815, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.859197735786438, + "num_tokens": 154912191.0, + "step": 4057 + }, + { + "epoch": 0.51621931052029, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.9646497964859009, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8440943360328674, + "num_tokens": 154953412.0, + "step": 4058 + }, + { + "epoch": 0.5163465207988805, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.827677845954895, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8673081398010254, + "num_tokens": 154989913.0, + "step": 4059 + }, + { + "epoch": 0.5164737310774711, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.6659433841705322, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8442546129226685, + "num_tokens": 155038094.0, + "step": 4060 + }, + { + "epoch": 0.5166009413560616, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.835923194885254, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8479882478713989, + "num_tokens": 155075900.0, + "step": 4061 + }, + { + "epoch": 0.5167281516346521, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.7999296188354492, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8564407825469971, + "num_tokens": 155117038.0, + "step": 4062 + }, + { + "epoch": 0.5168553619132426, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.0208077430725098, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8724920749664307, + "num_tokens": 155151442.0, + "step": 4063 + }, + { + "epoch": 0.5169825721918331, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.758101463317871, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8657389283180237, + "num_tokens": 155192288.0, + "step": 4064 + }, + { + "epoch": 0.5171097824704236, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.6124560832977295, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8600156307220459, + "num_tokens": 155238931.0, + "step": 4065 + }, + { + "epoch": 0.5172369927490141, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.788206934928894, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8707864284515381, + "num_tokens": 155276644.0, + "step": 4066 + }, + { + "epoch": 0.5173642030276047, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.9437001943588257, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8695905208587646, + "num_tokens": 155307638.0, + "step": 4067 + }, + { + "epoch": 0.5174914133061952, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.0261242389678955, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8507512807846069, + "num_tokens": 155343255.0, + "step": 4068 + }, + { + "epoch": 0.5176186235847856, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.9095826148986816, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8585355281829834, + "num_tokens": 155382283.0, + "step": 4069 + }, + { + "epoch": 0.5177458338633761, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.9322036504745483, + "learning_rate": 1e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8344067931175232, + "num_tokens": 155427096.0, + "step": 4070 + }, + { + "epoch": 0.5178730441419667, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.8749511241912842, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8460973501205444, + "num_tokens": 155467472.0, + "step": 4071 + }, + { + "epoch": 0.5180002544205572, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.9130799770355225, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8516998291015625, + "num_tokens": 155508350.0, + "step": 4072 + }, + { + "epoch": 0.5181274646991477, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.6901241540908813, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8795251846313477, + "num_tokens": 155547215.0, + "step": 4073 + }, + { + "epoch": 0.5182546749777382, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.7735844850540161, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8615465760231018, + "num_tokens": 155586658.0, + "step": 4074 + }, + { + "epoch": 0.5183818852563287, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.8821567296981812, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.859504222869873, + "num_tokens": 155620944.0, + "step": 4075 + }, + { + "epoch": 0.5185090955349192, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.894967794418335, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8551128506660461, + "num_tokens": 155662699.0, + "step": 4076 + }, + { + "epoch": 0.5186363058135097, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.76661217212677, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8452116250991821, + "num_tokens": 155707783.0, + "step": 4077 + }, + { + "epoch": 0.5187635160921003, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.8811200857162476, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8634681701660156, + "num_tokens": 155745227.0, + "step": 4078 + }, + { + "epoch": 0.5188907263706908, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.7247846126556396, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8592237234115601, + "num_tokens": 155786466.0, + "step": 4079 + }, + { + "epoch": 0.5190179366492813, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.8598500490188599, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8586864471435547, + "num_tokens": 155833145.0, + "step": 4080 + }, + { + "epoch": 0.5191451469278717, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.8883451223373413, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8476669788360596, + "num_tokens": 155869533.0, + "step": 4081 + }, + { + "epoch": 0.5192723572064623, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.9908390045166016, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8554590344429016, + "num_tokens": 155909329.0, + "step": 4082 + }, + { + "epoch": 0.5193995674850528, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.7258784770965576, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8654217720031738, + "num_tokens": 155947848.0, + "step": 4083 + }, + { + "epoch": 0.5195267777636433, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.0269241333007812, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8618537187576294, + "num_tokens": 155982693.0, + "step": 4084 + }, + { + "epoch": 0.5196539880422338, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.9412795305252075, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8559864163398743, + "num_tokens": 156023414.0, + "step": 4085 + }, + { + "epoch": 0.5197811983208244, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.739863634109497, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8569177985191345, + "num_tokens": 156063488.0, + "step": 4086 + }, + { + "epoch": 0.5199084085994148, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.7657088041305542, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8575812578201294, + "num_tokens": 156103081.0, + "step": 4087 + }, + { + "epoch": 0.5200356188780053, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.1137802600860596, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8497498631477356, + "num_tokens": 156139453.0, + "step": 4088 + }, + { + "epoch": 0.5201628291565958, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.0106253623962402, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8512591123580933, + "num_tokens": 156176564.0, + "step": 4089 + }, + { + "epoch": 0.5202900394351864, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.8476097583770752, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8610105514526367, + "num_tokens": 156221278.0, + "step": 4090 + }, + { + "epoch": 0.5204172497137769, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.9549715518951416, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8617329597473145, + "num_tokens": 156259124.0, + "step": 4091 + }, + { + "epoch": 0.5205444599923674, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.9831353425979614, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8465781211853027, + "num_tokens": 156297832.0, + "step": 4092 + }, + { + "epoch": 0.5206716702709578, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.564061403274536, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.863115668296814, + "num_tokens": 156329745.0, + "step": 4093 + }, + { + "epoch": 0.5207988805495484, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.0783286094665527, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8571429252624512, + "num_tokens": 156360154.0, + "step": 4094 + }, + { + "epoch": 0.5209260908281389, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.7696446180343628, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8605032563209534, + "num_tokens": 156399360.0, + "step": 4095 + }, + { + "epoch": 0.5210533011067294, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.5234594345092773, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8559451103210449, + "num_tokens": 156438782.0, + "step": 4096 + }, + { + "epoch": 0.52118051138532, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8884990215301514, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8645409345626831, + "num_tokens": 156470261.0, + "step": 4097 + }, + { + "epoch": 0.5213077216639105, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.9609354734420776, + "learning_rate": 1e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8398290872573853, + "num_tokens": 156510493.0, + "step": 4098 + }, + { + "epoch": 0.5214349319425009, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.8954901695251465, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8597237467765808, + "num_tokens": 156545040.0, + "step": 4099 + }, + { + "epoch": 0.5215621422210914, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.8649142980575562, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8555172681808472, + "num_tokens": 156583474.0, + "step": 4100 + }, + { + "epoch": 0.521689352499682, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.740420937538147, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8570712804794312, + "num_tokens": 156627137.0, + "step": 4101 + }, + { + "epoch": 0.5218165627782725, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.1733875274658203, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8468029499053955, + "num_tokens": 156664726.0, + "step": 4102 + }, + { + "epoch": 0.521943773056863, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.7602375745773315, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8747130632400513, + "num_tokens": 156707014.0, + "step": 4103 + }, + { + "epoch": 0.5220709833354535, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.8167815208435059, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8548897504806519, + "num_tokens": 156747784.0, + "step": 4104 + }, + { + "epoch": 0.522198193614044, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.850594162940979, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8550889492034912, + "num_tokens": 156784760.0, + "step": 4105 + }, + { + "epoch": 0.5223254038926345, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.7659491300582886, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8551858067512512, + "num_tokens": 156822899.0, + "step": 4106 + }, + { + "epoch": 0.522452614171225, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.7810256481170654, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8647505044937134, + "num_tokens": 156861308.0, + "step": 4107 + }, + { + "epoch": 0.5225798244498155, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.7796534299850464, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8637208938598633, + "num_tokens": 156901981.0, + "step": 4108 + }, + { + "epoch": 0.5227070347284061, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.8112865686416626, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8627470135688782, + "num_tokens": 156942215.0, + "step": 4109 + }, + { + "epoch": 0.5228342450069966, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.84199857711792, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.864641547203064, + "num_tokens": 156983820.0, + "step": 4110 + }, + { + "epoch": 0.5229614552855871, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.911739706993103, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8698408007621765, + "num_tokens": 157016215.0, + "step": 4111 + }, + { + "epoch": 0.5230886655641775, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.9119757413864136, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8573185205459595, + "num_tokens": 157052071.0, + "step": 4112 + }, + { + "epoch": 0.5232158758427681, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.8390061855316162, + "learning_rate": 1e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.8368427753448486, + "num_tokens": 157097711.0, + "step": 4113 + }, + { + "epoch": 0.5233430861213586, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 2.039496898651123, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.856197714805603, + "num_tokens": 157134801.0, + "step": 4114 + }, + { + "epoch": 0.5234702963999491, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.824818730354309, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8580700755119324, + "num_tokens": 157171885.0, + "step": 4115 + }, + { + "epoch": 0.5235975066785397, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.8162132501602173, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8567169904708862, + "num_tokens": 157211915.0, + "step": 4116 + }, + { + "epoch": 0.5237247169571302, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.6954141855239868, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8513519763946533, + "num_tokens": 157260564.0, + "step": 4117 + }, + { + "epoch": 0.5238519272357206, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.956606149673462, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8598569631576538, + "num_tokens": 157294676.0, + "step": 4118 + }, + { + "epoch": 0.5239791375143111, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.7685582637786865, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8671849370002747, + "num_tokens": 157334607.0, + "step": 4119 + }, + { + "epoch": 0.5241063477929017, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.9688749313354492, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.859016478061676, + "num_tokens": 157372021.0, + "step": 4120 + }, + { + "epoch": 0.5242335580714922, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.8934849500656128, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8625746369361877, + "num_tokens": 157410092.0, + "step": 4121 + }, + { + "epoch": 0.5243607683500827, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8181167840957642, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8547086715698242, + "num_tokens": 157447979.0, + "step": 4122 + }, + { + "epoch": 0.5244879786286732, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 2.0074944496154785, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8522313833236694, + "num_tokens": 157481376.0, + "step": 4123 + }, + { + "epoch": 0.5246151889072637, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.998823642730713, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8634741306304932, + "num_tokens": 157519555.0, + "step": 4124 + }, + { + "epoch": 0.5247423991858542, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8845621347427368, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8561019897460938, + "num_tokens": 157551849.0, + "step": 4125 + }, + { + "epoch": 0.5248696094644447, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7583285570144653, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8576422929763794, + "num_tokens": 157590823.0, + "step": 4126 + }, + { + "epoch": 0.5249968197430352, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.875986933708191, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8679680824279785, + "num_tokens": 157633771.0, + "step": 4127 + }, + { + "epoch": 0.5251240300216258, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.124680995941162, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8701964616775513, + "num_tokens": 157667796.0, + "step": 4128 + }, + { + "epoch": 0.5252512403002163, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.729485273361206, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8683443069458008, + "num_tokens": 157709452.0, + "step": 4129 + }, + { + "epoch": 0.5253784505788067, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0001440048217773, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8484097719192505, + "num_tokens": 157741407.0, + "step": 4130 + }, + { + "epoch": 0.5255056608573972, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.810012698173523, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8686625957489014, + "num_tokens": 157777693.0, + "step": 4131 + }, + { + "epoch": 0.5256328711359878, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.926656723022461, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8555872440338135, + "num_tokens": 157814729.0, + "step": 4132 + }, + { + "epoch": 0.5257600814145783, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8122142553329468, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8688321113586426, + "num_tokens": 157853698.0, + "step": 4133 + }, + { + "epoch": 0.5258872916931688, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0088069438934326, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8633232712745667, + "num_tokens": 157888580.0, + "step": 4134 + }, + { + "epoch": 0.5260145019717594, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7083793878555298, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8563989400863647, + "num_tokens": 157928434.0, + "step": 4135 + }, + { + "epoch": 0.5261417122503498, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8321113586425781, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8536630868911743, + "num_tokens": 157968936.0, + "step": 4136 + }, + { + "epoch": 0.5262689225289403, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0549824237823486, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8465337157249451, + "num_tokens": 158005496.0, + "step": 4137 + }, + { + "epoch": 0.5263961328075308, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.804158329963684, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8654504418373108, + "num_tokens": 158047569.0, + "step": 4138 + }, + { + "epoch": 0.5265233430861214, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9034595489501953, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8578751087188721, + "num_tokens": 158085733.0, + "step": 4139 + }, + { + "epoch": 0.5266505533647119, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8884543180465698, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8473361730575562, + "num_tokens": 158123906.0, + "step": 4140 + }, + { + "epoch": 0.5267777636433024, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8478327989578247, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8629727959632874, + "num_tokens": 158163663.0, + "step": 4141 + }, + { + "epoch": 0.5269049739218928, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8243058919906616, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8458806276321411, + "num_tokens": 158205878.0, + "step": 4142 + }, + { + "epoch": 0.5270321842004834, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9742318391799927, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8606956601142883, + "num_tokens": 158241608.0, + "step": 4143 + }, + { + "epoch": 0.5271593944790739, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.4391374588012695, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8696173429489136, + "num_tokens": 158274259.0, + "step": 4144 + }, + { + "epoch": 0.5272866047576644, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9771168231964111, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8648590445518494, + "num_tokens": 158304793.0, + "step": 4145 + }, + { + "epoch": 0.527413815036255, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.2230687141418457, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8706419467926025, + "num_tokens": 158349420.0, + "step": 4146 + }, + { + "epoch": 0.5275410253148455, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7596834897994995, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8572766780853271, + "num_tokens": 158395834.0, + "step": 4147 + }, + { + "epoch": 0.5276682355934359, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.1370646953582764, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8601706027984619, + "num_tokens": 158435568.0, + "step": 4148 + }, + { + "epoch": 0.5277954458720264, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7961490154266357, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8678620457649231, + "num_tokens": 158473584.0, + "step": 4149 + }, + { + "epoch": 0.527922656150617, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7631340026855469, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8598468899726868, + "num_tokens": 158513435.0, + "step": 4150 + }, + { + "epoch": 0.5280498664292075, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8343344926834106, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.858847439289093, + "num_tokens": 158549478.0, + "step": 4151 + }, + { + "epoch": 0.528177076707798, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.584297776222229, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8624018430709839, + "num_tokens": 158598742.0, + "step": 4152 + }, + { + "epoch": 0.5283042869863885, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.9485713243484497, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8470946550369263, + "num_tokens": 158636806.0, + "step": 4153 + }, + { + "epoch": 0.528431497264979, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9178191423416138, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8508255481719971, + "num_tokens": 158676043.0, + "step": 4154 + }, + { + "epoch": 0.5285587075435695, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.3501455783843994, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8435569405555725, + "num_tokens": 158711772.0, + "step": 4155 + }, + { + "epoch": 0.52868591782216, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.1939048767089844, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8563464283943176, + "num_tokens": 158749608.0, + "step": 4156 + }, + { + "epoch": 0.5288131281007505, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.1857874393463135, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8595185875892639, + "num_tokens": 158790502.0, + "step": 4157 + }, + { + "epoch": 0.5289403383793411, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9743906259536743, + "learning_rate": 1e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8350173830986023, + "num_tokens": 158828415.0, + "step": 4158 + }, + { + "epoch": 0.5290675486579316, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9794013500213623, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8486540913581848, + "num_tokens": 158866972.0, + "step": 4159 + }, + { + "epoch": 0.5291947589365221, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.852306604385376, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8554667830467224, + "num_tokens": 158902991.0, + "step": 4160 + }, + { + "epoch": 0.5293219692151125, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.6992813348770142, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8660011291503906, + "num_tokens": 158945963.0, + "step": 4161 + }, + { + "epoch": 0.5294491794937031, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.979562520980835, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8569830060005188, + "num_tokens": 158985407.0, + "step": 4162 + }, + { + "epoch": 0.5295763897722936, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 2.0842976570129395, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8507232666015625, + "num_tokens": 159024387.0, + "step": 4163 + }, + { + "epoch": 0.5297036000508841, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9484840631484985, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.848739504814148, + "num_tokens": 159061430.0, + "step": 4164 + }, + { + "epoch": 0.5298308103294747, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8790037631988525, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8587589263916016, + "num_tokens": 159099350.0, + "step": 4165 + }, + { + "epoch": 0.5299580206080652, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9387078285217285, + "learning_rate": 1e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.832442045211792, + "num_tokens": 159136276.0, + "step": 4166 + }, + { + "epoch": 0.5300852308866556, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.225994348526001, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8800343871116638, + "num_tokens": 159177102.0, + "step": 4167 + }, + { + "epoch": 0.5302124411652461, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0095701217651367, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8688598871231079, + "num_tokens": 159213582.0, + "step": 4168 + }, + { + "epoch": 0.5303396514438367, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.823285460472107, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8563655018806458, + "num_tokens": 159254283.0, + "step": 4169 + }, + { + "epoch": 0.5304668617224272, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.017671823501587, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8582663536071777, + "num_tokens": 159288557.0, + "step": 4170 + }, + { + "epoch": 0.5305940720010177, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8291223049163818, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8663181662559509, + "num_tokens": 159325047.0, + "step": 4171 + }, + { + "epoch": 0.5307212822796082, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9649296998977661, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8609873652458191, + "num_tokens": 159366291.0, + "step": 4172 + }, + { + "epoch": 0.5308484925581987, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.029799699783325, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8505995869636536, + "num_tokens": 159401634.0, + "step": 4173 + }, + { + "epoch": 0.5309757028367892, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.007638692855835, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8495863676071167, + "num_tokens": 159435316.0, + "step": 4174 + }, + { + "epoch": 0.5311029131153797, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9130364656448364, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.851359486579895, + "num_tokens": 159472192.0, + "step": 4175 + }, + { + "epoch": 0.5312301233939702, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8295414447784424, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8646345138549805, + "num_tokens": 159515342.0, + "step": 4176 + }, + { + "epoch": 0.5313573336725608, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8611481189727783, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8624192476272583, + "num_tokens": 159553932.0, + "step": 4177 + }, + { + "epoch": 0.5314845439511513, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9952138662338257, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8588972687721252, + "num_tokens": 159584711.0, + "step": 4178 + }, + { + "epoch": 0.5316117542297417, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0932958126068115, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8447534441947937, + "num_tokens": 159618784.0, + "step": 4179 + }, + { + "epoch": 0.5317389645083322, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0574100017547607, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8555339574813843, + "num_tokens": 159661288.0, + "step": 4180 + }, + { + "epoch": 0.5318661747869228, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.780745029449463, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8693243265151978, + "num_tokens": 159701286.0, + "step": 4181 + }, + { + "epoch": 0.5319933850655133, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7159620523452759, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8460819721221924, + "num_tokens": 159746631.0, + "step": 4182 + }, + { + "epoch": 0.5321205953441038, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9108531475067139, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8610721230506897, + "num_tokens": 159787753.0, + "step": 4183 + }, + { + "epoch": 0.5322478056226944, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9432861804962158, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8496729135513306, + "num_tokens": 159823092.0, + "step": 4184 + }, + { + "epoch": 0.5323750159012848, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.3625247478485107, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8489209413528442, + "num_tokens": 159858480.0, + "step": 4185 + }, + { + "epoch": 0.5325022261798753, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9723126888275146, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8487027287483215, + "num_tokens": 159896220.0, + "step": 4186 + }, + { + "epoch": 0.5326294364584658, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9860731363296509, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8566084504127502, + "num_tokens": 159930995.0, + "step": 4187 + }, + { + "epoch": 0.5327566467370564, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.1979706287384033, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8525269627571106, + "num_tokens": 159968013.0, + "step": 4188 + }, + { + "epoch": 0.5328838570156469, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.957718849182129, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8513844609260559, + "num_tokens": 160004085.0, + "step": 4189 + }, + { + "epoch": 0.5330110672942374, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8131896257400513, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8682565689086914, + "num_tokens": 160040606.0, + "step": 4190 + }, + { + "epoch": 0.5331382775728278, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7968746423721313, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8691046833992004, + "num_tokens": 160080792.0, + "step": 4191 + }, + { + "epoch": 0.5332654878514184, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.800659418106079, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8457057476043701, + "num_tokens": 160117055.0, + "step": 4192 + }, + { + "epoch": 0.5333926981300089, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9270954132080078, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8632704019546509, + "num_tokens": 160153223.0, + "step": 4193 + }, + { + "epoch": 0.5335199084085994, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9331889152526855, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.840034008026123, + "num_tokens": 160190920.0, + "step": 4194 + }, + { + "epoch": 0.53364711868719, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0302703380584717, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8555241823196411, + "num_tokens": 160221898.0, + "step": 4195 + }, + { + "epoch": 0.5337743289657805, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.848540186882019, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8601226806640625, + "num_tokens": 160267102.0, + "step": 4196 + }, + { + "epoch": 0.5339015392443709, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8226120471954346, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.86275315284729, + "num_tokens": 160304422.0, + "step": 4197 + }, + { + "epoch": 0.5340287495229614, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9421360492706299, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8544886112213135, + "num_tokens": 160339006.0, + "step": 4198 + }, + { + "epoch": 0.534155959801552, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7127034664154053, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8772926330566406, + "num_tokens": 160377504.0, + "step": 4199 + }, + { + "epoch": 0.5342831700801425, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.94345223903656, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8376587629318237, + "num_tokens": 160413315.0, + "step": 4200 + }, + { + "epoch": 0.534410380358733, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8051456212997437, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8721208572387695, + "num_tokens": 160455034.0, + "step": 4201 + }, + { + "epoch": 0.5345375906373235, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0507020950317383, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.852645218372345, + "num_tokens": 160490505.0, + "step": 4202 + }, + { + "epoch": 0.534664800915914, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7792022228240967, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8675632476806641, + "num_tokens": 160528712.0, + "step": 4203 + }, + { + "epoch": 0.5347920111945045, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8615845441818237, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8501348495483398, + "num_tokens": 160567945.0, + "step": 4204 + }, + { + "epoch": 0.534919221473095, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.6710498332977295, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8640813827514648, + "num_tokens": 160596767.0, + "step": 4205 + }, + { + "epoch": 0.5350464317516855, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7961167097091675, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8550083041191101, + "num_tokens": 160636912.0, + "step": 4206 + }, + { + "epoch": 0.5351736420302761, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.895171046257019, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8522657155990601, + "num_tokens": 160674588.0, + "step": 4207 + }, + { + "epoch": 0.5353008523088666, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.2336277961730957, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8474549651145935, + "num_tokens": 160713441.0, + "step": 4208 + }, + { + "epoch": 0.5354280625874571, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.931588888168335, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.867370069026947, + "num_tokens": 160752716.0, + "step": 4209 + }, + { + "epoch": 0.5355552728660475, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7535501718521118, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8681933283805847, + "num_tokens": 160798771.0, + "step": 4210 + }, + { + "epoch": 0.5356824831446381, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.606820821762085, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8750214576721191, + "num_tokens": 160843085.0, + "step": 4211 + }, + { + "epoch": 0.5358096934232286, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9597457647323608, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8504826426506042, + "num_tokens": 160883073.0, + "step": 4212 + }, + { + "epoch": 0.5359369037018191, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8494040966033936, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8643707633018494, + "num_tokens": 160923734.0, + "step": 4213 + }, + { + "epoch": 0.5360641139804097, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9089492559432983, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8554643392562866, + "num_tokens": 160963460.0, + "step": 4214 + }, + { + "epoch": 0.5361913242590002, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8541109561920166, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8571618795394897, + "num_tokens": 161003691.0, + "step": 4215 + }, + { + "epoch": 0.5363185345375906, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8915717601776123, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8588097095489502, + "num_tokens": 161042056.0, + "step": 4216 + }, + { + "epoch": 0.5364457448161811, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9014791250228882, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8817940354347229, + "num_tokens": 161078670.0, + "step": 4217 + }, + { + "epoch": 0.5365729550947717, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.048720359802246, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.872103750705719, + "num_tokens": 161113992.0, + "step": 4218 + }, + { + "epoch": 0.5367001653733622, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7045814990997314, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8636339902877808, + "num_tokens": 161153763.0, + "step": 4219 + }, + { + "epoch": 0.5368273756519527, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.003140449523926, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8423561453819275, + "num_tokens": 161184186.0, + "step": 4220 + }, + { + "epoch": 0.5369545859305432, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0602612495422363, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8558589220046997, + "num_tokens": 161219298.0, + "step": 4221 + }, + { + "epoch": 0.5370817962091337, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9611728191375732, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8739851713180542, + "num_tokens": 161255252.0, + "step": 4222 + }, + { + "epoch": 0.5372090064877242, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9148828983306885, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8627917766571045, + "num_tokens": 161295890.0, + "step": 4223 + }, + { + "epoch": 0.5373362167663147, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0959055423736572, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8557947278022766, + "num_tokens": 161330808.0, + "step": 4224 + }, + { + "epoch": 0.5374634270449052, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9337021112442017, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8650019764900208, + "num_tokens": 161370515.0, + "step": 4225 + }, + { + "epoch": 0.5375906373234958, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.035712242126465, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.871694803237915, + "num_tokens": 161405257.0, + "step": 4226 + }, + { + "epoch": 0.5377178476020863, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9236112833023071, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8588330149650574, + "num_tokens": 161443526.0, + "step": 4227 + }, + { + "epoch": 0.5378450578806767, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.001978874206543, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8519201874732971, + "num_tokens": 161481262.0, + "step": 4228 + }, + { + "epoch": 0.5379722681592672, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.4363491535186768, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8440403342247009, + "num_tokens": 161517756.0, + "step": 4229 + }, + { + "epoch": 0.5380994784378578, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9504754543304443, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8676535487174988, + "num_tokens": 161552680.0, + "step": 4230 + }, + { + "epoch": 0.5382266887164483, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.1427254676818848, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8557008504867554, + "num_tokens": 161588607.0, + "step": 4231 + }, + { + "epoch": 0.5383538989950388, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9614713191986084, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8410636186599731, + "num_tokens": 161626427.0, + "step": 4232 + }, + { + "epoch": 0.5384811092736294, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8750606775283813, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8642727136611938, + "num_tokens": 161665182.0, + "step": 4233 + }, + { + "epoch": 0.5386083195522198, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.883582353591919, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8583582043647766, + "num_tokens": 161704685.0, + "step": 4234 + }, + { + "epoch": 0.5387355298308103, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7768242359161377, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8639755249023438, + "num_tokens": 161742960.0, + "step": 4235 + }, + { + "epoch": 0.5388627401094008, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0017528533935547, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.857209324836731, + "num_tokens": 161778199.0, + "step": 4236 + }, + { + "epoch": 0.5389899503879914, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8941715955734253, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8553622961044312, + "num_tokens": 161811235.0, + "step": 4237 + }, + { + "epoch": 0.5391171606665819, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8957178592681885, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8592262268066406, + "num_tokens": 161847669.0, + "step": 4238 + }, + { + "epoch": 0.5392443709451724, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0113511085510254, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8447278141975403, + "num_tokens": 161888470.0, + "step": 4239 + }, + { + "epoch": 0.5393715812237628, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.093217611312866, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8399491310119629, + "num_tokens": 161924563.0, + "step": 4240 + }, + { + "epoch": 0.5394987915023534, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9586923122406006, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8451548218727112, + "num_tokens": 161959062.0, + "step": 4241 + }, + { + "epoch": 0.5396260017809439, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.902750849723816, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8631698489189148, + "num_tokens": 161994755.0, + "step": 4242 + }, + { + "epoch": 0.5397532120595344, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.78181791305542, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8680065274238586, + "num_tokens": 162032812.0, + "step": 4243 + }, + { + "epoch": 0.5398804223381249, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.2652499675750732, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8615248203277588, + "num_tokens": 162067130.0, + "step": 4244 + }, + { + "epoch": 0.5400076326167155, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 16.600830078125, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8607586622238159, + "num_tokens": 162107452.0, + "step": 4245 + }, + { + "epoch": 0.5401348428953059, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.967085599899292, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8595335483551025, + "num_tokens": 162144220.0, + "step": 4246 + }, + { + "epoch": 0.5402620531738964, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.413116931915283, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8672064542770386, + "num_tokens": 162181798.0, + "step": 4247 + }, + { + "epoch": 0.540389263452487, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.891656756401062, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8515267372131348, + "num_tokens": 162222501.0, + "step": 4248 + }, + { + "epoch": 0.5405164737310775, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7504578828811646, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.854440450668335, + "num_tokens": 162265533.0, + "step": 4249 + }, + { + "epoch": 0.540643684009668, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8302818536758423, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8741266131401062, + "num_tokens": 162304984.0, + "step": 4250 + }, + { + "epoch": 0.5407708942882585, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9634524583816528, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8502658009529114, + "num_tokens": 162342358.0, + "step": 4251 + }, + { + "epoch": 0.540898104566849, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9189107418060303, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8587214946746826, + "num_tokens": 162377343.0, + "step": 4252 + }, + { + "epoch": 0.5410253148454395, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9026544094085693, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8597089648246765, + "num_tokens": 162416416.0, + "step": 4253 + }, + { + "epoch": 0.54115252512403, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0160791873931885, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8833036422729492, + "num_tokens": 162447108.0, + "step": 4254 + }, + { + "epoch": 0.5412797354026205, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.983176589012146, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.860965371131897, + "num_tokens": 162489659.0, + "step": 4255 + }, + { + "epoch": 0.5414069456812111, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9909802675247192, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8482537269592285, + "num_tokens": 162525816.0, + "step": 4256 + }, + { + "epoch": 0.5415341559598016, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.1024868488311768, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8550390601158142, + "num_tokens": 162564154.0, + "step": 4257 + }, + { + "epoch": 0.5416613662383921, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.866143822669983, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8534614443778992, + "num_tokens": 162602728.0, + "step": 4258 + }, + { + "epoch": 0.5417885765169825, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.139979124069214, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8623350262641907, + "num_tokens": 162636659.0, + "step": 4259 + }, + { + "epoch": 0.5419157867955731, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8930901288986206, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8743513226509094, + "num_tokens": 162676639.0, + "step": 4260 + }, + { + "epoch": 0.5420429970741636, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8340702056884766, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8405317068099976, + "num_tokens": 162716763.0, + "step": 4261 + }, + { + "epoch": 0.5421702073527541, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.890909194946289, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8481923341751099, + "num_tokens": 162753943.0, + "step": 4262 + }, + { + "epoch": 0.5422974176313446, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9531539678573608, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8539053201675415, + "num_tokens": 162792938.0, + "step": 4263 + }, + { + "epoch": 0.5424246279099352, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8948513269424438, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8563598394393921, + "num_tokens": 162838938.0, + "step": 4264 + }, + { + "epoch": 0.5425518381885256, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8969148397445679, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8644354343414307, + "num_tokens": 162875026.0, + "step": 4265 + }, + { + "epoch": 0.5426790484671161, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.197040557861328, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8623558878898621, + "num_tokens": 162910724.0, + "step": 4266 + }, + { + "epoch": 0.5428062587457066, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.095759630203247, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8614983558654785, + "num_tokens": 162941602.0, + "step": 4267 + }, + { + "epoch": 0.5429334690242972, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0137481689453125, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8461978435516357, + "num_tokens": 162980760.0, + "step": 4268 + }, + { + "epoch": 0.5430606793028877, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9007539749145508, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8691710233688354, + "num_tokens": 163016155.0, + "step": 4269 + }, + { + "epoch": 0.5431878895814782, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0577621459960938, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8500914573669434, + "num_tokens": 163052519.0, + "step": 4270 + }, + { + "epoch": 0.5433150998600687, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9557639360427856, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8683423399925232, + "num_tokens": 163091806.0, + "step": 4271 + }, + { + "epoch": 0.5434423101386592, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.890503168106079, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8745444416999817, + "num_tokens": 163128830.0, + "step": 4272 + }, + { + "epoch": 0.5435695204172497, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8206185102462769, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8533146381378174, + "num_tokens": 163169567.0, + "step": 4273 + }, + { + "epoch": 0.5436967306958402, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0259475708007812, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8522512316703796, + "num_tokens": 163202913.0, + "step": 4274 + }, + { + "epoch": 0.5438239409744308, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9206717014312744, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8663879036903381, + "num_tokens": 163236741.0, + "step": 4275 + }, + { + "epoch": 0.5439511512530213, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7402504682540894, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8535501956939697, + "num_tokens": 163281402.0, + "step": 4276 + }, + { + "epoch": 0.5440783615316117, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.850074052810669, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8674793243408203, + "num_tokens": 163325288.0, + "step": 4277 + }, + { + "epoch": 0.5442055718102022, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.4207303524017334, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8491503000259399, + "num_tokens": 163367471.0, + "step": 4278 + }, + { + "epoch": 0.5443327820887928, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9417507648468018, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8600935935974121, + "num_tokens": 163400077.0, + "step": 4279 + }, + { + "epoch": 0.5444599923673833, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8119308948516846, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8620613813400269, + "num_tokens": 163433739.0, + "step": 4280 + }, + { + "epoch": 0.5445872026459738, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8957669734954834, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8593457937240601, + "num_tokens": 163467447.0, + "step": 4281 + }, + { + "epoch": 0.5447144129245644, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.912935495376587, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8555907607078552, + "num_tokens": 163506633.0, + "step": 4282 + }, + { + "epoch": 0.5448416232031548, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.042405843734741, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8433343172073364, + "num_tokens": 163542773.0, + "step": 4283 + }, + { + "epoch": 0.5449688334817453, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9468841552734375, + "learning_rate": 1e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.8260847330093384, + "num_tokens": 163585247.0, + "step": 4284 + }, + { + "epoch": 0.5450960437603358, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.8899857997894287, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8462793827056885, + "num_tokens": 163622802.0, + "step": 4285 + }, + { + "epoch": 0.5452232540389264, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.154169797897339, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8827304244041443, + "num_tokens": 163655980.0, + "step": 4286 + }, + { + "epoch": 0.5453504643175169, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8946449756622314, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8639950752258301, + "num_tokens": 163691494.0, + "step": 4287 + }, + { + "epoch": 0.5454776745961074, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9789245128631592, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8705122470855713, + "num_tokens": 163725169.0, + "step": 4288 + }, + { + "epoch": 0.5456048848746978, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8568366765975952, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8618165850639343, + "num_tokens": 163766050.0, + "step": 4289 + }, + { + "epoch": 0.5457320951532884, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.017711639404297, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8524026870727539, + "num_tokens": 163799559.0, + "step": 4290 + }, + { + "epoch": 0.5458593054318789, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9473698139190674, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8526424169540405, + "num_tokens": 163837052.0, + "step": 4291 + }, + { + "epoch": 0.5459865157104694, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 16.609403610229492, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8569039106369019, + "num_tokens": 163872494.0, + "step": 4292 + }, + { + "epoch": 0.5461137259890599, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9270811080932617, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.850069522857666, + "num_tokens": 163913499.0, + "step": 4293 + }, + { + "epoch": 0.5462409362676505, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.786794662475586, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8401545286178589, + "num_tokens": 163957965.0, + "step": 4294 + }, + { + "epoch": 0.5463681465462409, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8041930198669434, + "learning_rate": 1e-06, + "loss": 0.58, + "mean_token_accuracy": 0.8186516761779785, + "num_tokens": 164005661.0, + "step": 4295 + }, + { + "epoch": 0.5464953568248314, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.7559583187103271, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8734631538391113, + "num_tokens": 164044717.0, + "step": 4296 + }, + { + "epoch": 0.5466225671034219, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.2152864933013916, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8583459854125977, + "num_tokens": 164080022.0, + "step": 4297 + }, + { + "epoch": 0.5467497773820125, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0145211219787598, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8647928237915039, + "num_tokens": 164112749.0, + "step": 4298 + }, + { + "epoch": 0.546876987660603, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.012979745864868, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8536170125007629, + "num_tokens": 164152517.0, + "step": 4299 + }, + { + "epoch": 0.5470041979391935, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.954642653465271, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8652197122573853, + "num_tokens": 164195245.0, + "step": 4300 + }, + { + "epoch": 0.5471314082177839, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8755513429641724, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.85345458984375, + "num_tokens": 164230643.0, + "step": 4301 + }, + { + "epoch": 0.5472586184963745, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.7920507192611694, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8547534346580505, + "num_tokens": 164271089.0, + "step": 4302 + }, + { + "epoch": 0.547385828774965, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9190834760665894, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8572187423706055, + "num_tokens": 164310618.0, + "step": 4303 + }, + { + "epoch": 0.5475130390535555, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.955246090888977, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8456252813339233, + "num_tokens": 164348630.0, + "step": 4304 + }, + { + "epoch": 0.5476402493321461, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9259848594665527, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8658902645111084, + "num_tokens": 164383735.0, + "step": 4305 + }, + { + "epoch": 0.5477674596107366, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.7934006452560425, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8592990040779114, + "num_tokens": 164426115.0, + "step": 4306 + }, + { + "epoch": 0.5478946698893271, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9162843227386475, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8508902788162231, + "num_tokens": 164462119.0, + "step": 4307 + }, + { + "epoch": 0.5480218801679175, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8418015241622925, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8555383086204529, + "num_tokens": 164503030.0, + "step": 4308 + }, + { + "epoch": 0.5481490904465081, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.0088114738464355, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.860131025314331, + "num_tokens": 164541091.0, + "step": 4309 + }, + { + "epoch": 0.5482763007250986, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8580577373504639, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8528965711593628, + "num_tokens": 164577678.0, + "step": 4310 + }, + { + "epoch": 0.5484035110036891, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9808114767074585, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8560366034507751, + "num_tokens": 164612779.0, + "step": 4311 + }, + { + "epoch": 0.5485307212822796, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8539018630981445, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8640395998954773, + "num_tokens": 164649849.0, + "step": 4312 + }, + { + "epoch": 0.5486579315608702, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8665151596069336, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8549271821975708, + "num_tokens": 164690538.0, + "step": 4313 + }, + { + "epoch": 0.5487851418394606, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.7559200525283813, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8580179214477539, + "num_tokens": 164729338.0, + "step": 4314 + }, + { + "epoch": 0.5489123521180511, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9405128955841064, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8694171905517578, + "num_tokens": 164766648.0, + "step": 4315 + }, + { + "epoch": 0.5490395623966416, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8658945560455322, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8573734760284424, + "num_tokens": 164804254.0, + "step": 4316 + }, + { + "epoch": 0.5491667726752322, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8988213539123535, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8720198273658752, + "num_tokens": 164839943.0, + "step": 4317 + }, + { + "epoch": 0.5492939829538227, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.821076512336731, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8512610197067261, + "num_tokens": 164887057.0, + "step": 4318 + }, + { + "epoch": 0.5494211932324132, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8684123754501343, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.858053982257843, + "num_tokens": 164928515.0, + "step": 4319 + }, + { + "epoch": 0.5495484035110036, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8423045873641968, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8573179244995117, + "num_tokens": 164963818.0, + "step": 4320 + }, + { + "epoch": 0.5496756137895942, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.1198675632476807, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.872181236743927, + "num_tokens": 165001091.0, + "step": 4321 + }, + { + "epoch": 0.5498028240681847, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.0091562271118164, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8405020236968994, + "num_tokens": 165046009.0, + "step": 4322 + }, + { + "epoch": 0.5499300343467752, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9319862127304077, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8558728694915771, + "num_tokens": 165090729.0, + "step": 4323 + }, + { + "epoch": 0.5500572446253658, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.742415428161621, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8754765391349792, + "num_tokens": 165129771.0, + "step": 4324 + }, + { + "epoch": 0.5501844549039563, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9067388772964478, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8724592924118042, + "num_tokens": 165164310.0, + "step": 4325 + }, + { + "epoch": 0.5503116651825467, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9031808376312256, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8558228015899658, + "num_tokens": 165202842.0, + "step": 4326 + }, + { + "epoch": 0.5504388754611372, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.903212070465088, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.863153338432312, + "num_tokens": 165235998.0, + "step": 4327 + }, + { + "epoch": 0.5505660857397278, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8742389678955078, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8604898452758789, + "num_tokens": 165271779.0, + "step": 4328 + }, + { + "epoch": 0.5506932960183183, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.9070374965667725, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8519696593284607, + "num_tokens": 165306136.0, + "step": 4329 + }, + { + "epoch": 0.5508205062969088, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8463462591171265, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8735455274581909, + "num_tokens": 165342401.0, + "step": 4330 + }, + { + "epoch": 0.5509477165754993, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.7822853326797485, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8606655597686768, + "num_tokens": 165380120.0, + "step": 4331 + }, + { + "epoch": 0.5510749268540898, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8586379289627075, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8457014560699463, + "num_tokens": 165418210.0, + "step": 4332 + }, + { + "epoch": 0.5512021371326803, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9845093488693237, + "learning_rate": 1e-06, + "loss": 0.523, + "mean_token_accuracy": 0.8382377624511719, + "num_tokens": 165460952.0, + "step": 4333 + }, + { + "epoch": 0.5513293474112708, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.7768136262893677, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8724732398986816, + "num_tokens": 165500370.0, + "step": 4334 + }, + { + "epoch": 0.5514565576898613, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9778199195861816, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8575608134269714, + "num_tokens": 165534089.0, + "step": 4335 + }, + { + "epoch": 0.5515837679684519, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8618265390396118, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8508797883987427, + "num_tokens": 165573111.0, + "step": 4336 + }, + { + "epoch": 0.5517109782470424, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8220282793045044, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8493994474411011, + "num_tokens": 165611942.0, + "step": 4337 + }, + { + "epoch": 0.5518381885256328, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8504704236984253, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8661473989486694, + "num_tokens": 165651703.0, + "step": 4338 + }, + { + "epoch": 0.5519653988042234, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.878197431564331, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8566726446151733, + "num_tokens": 165691257.0, + "step": 4339 + }, + { + "epoch": 0.5520926090828139, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9677734375, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8741041421890259, + "num_tokens": 165727947.0, + "step": 4340 + }, + { + "epoch": 0.5522198193614044, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9008140563964844, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8671320676803589, + "num_tokens": 165763400.0, + "step": 4341 + }, + { + "epoch": 0.5523470296399949, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.7293035984039307, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8593482375144958, + "num_tokens": 165809586.0, + "step": 4342 + }, + { + "epoch": 0.5524742399185855, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9391008615493774, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8624030351638794, + "num_tokens": 165839565.0, + "step": 4343 + }, + { + "epoch": 0.5526014501971759, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8948575258255005, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8632155656814575, + "num_tokens": 165873539.0, + "step": 4344 + }, + { + "epoch": 0.5527286604757664, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.7199636697769165, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8670229911804199, + "num_tokens": 165920987.0, + "step": 4345 + }, + { + "epoch": 0.5528558707543569, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.870388150215149, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8483092784881592, + "num_tokens": 165965661.0, + "step": 4346 + }, + { + "epoch": 0.5529830810329475, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9649578332901, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8519116640090942, + "num_tokens": 165998239.0, + "step": 4347 + }, + { + "epoch": 0.553110291311538, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8806390762329102, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8566102981567383, + "num_tokens": 166036489.0, + "step": 4348 + }, + { + "epoch": 0.5532375015901285, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.812669277191162, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8550893664360046, + "num_tokens": 166078774.0, + "step": 4349 + }, + { + "epoch": 0.5533647118687189, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8631867170333862, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8655527830123901, + "num_tokens": 166119555.0, + "step": 4350 + }, + { + "epoch": 0.5534919221473095, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.7511340379714966, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8573024868965149, + "num_tokens": 166159788.0, + "step": 4351 + }, + { + "epoch": 0.5536191324259, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9823061227798462, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8540732860565186, + "num_tokens": 166200230.0, + "step": 4352 + }, + { + "epoch": 0.5537463427044905, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.88860285282135, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8629161715507507, + "num_tokens": 166237864.0, + "step": 4353 + }, + { + "epoch": 0.553873552983081, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8245468139648438, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.861197292804718, + "num_tokens": 166278347.0, + "step": 4354 + }, + { + "epoch": 0.5540007632616716, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 16.609067916870117, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8494390845298767, + "num_tokens": 166315744.0, + "step": 4355 + }, + { + "epoch": 0.554127973540262, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.1575703620910645, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8594434261322021, + "num_tokens": 166352705.0, + "step": 4356 + }, + { + "epoch": 0.5542551838188525, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9195839166641235, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8636317253112793, + "num_tokens": 166393956.0, + "step": 4357 + }, + { + "epoch": 0.554382394097443, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.0028555393218994, + "learning_rate": 1e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8353888988494873, + "num_tokens": 166430172.0, + "step": 4358 + }, + { + "epoch": 0.5545096043760336, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8108489513397217, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8586002588272095, + "num_tokens": 166468732.0, + "step": 4359 + }, + { + "epoch": 0.5546368146546241, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.7294286489486694, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8530181646347046, + "num_tokens": 166511383.0, + "step": 4360 + }, + { + "epoch": 0.5547640249332146, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8820396661758423, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8621034026145935, + "num_tokens": 166547674.0, + "step": 4361 + }, + { + "epoch": 0.5548912352118052, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9411107301712036, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8516374230384827, + "num_tokens": 166581763.0, + "step": 4362 + }, + { + "epoch": 0.5550184454903956, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.846266508102417, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8557201027870178, + "num_tokens": 166618604.0, + "step": 4363 + }, + { + "epoch": 0.5551456557689861, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.1222522258758545, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8688584566116333, + "num_tokens": 166654812.0, + "step": 4364 + }, + { + "epoch": 0.5552728660475766, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.0095407962799072, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8460621237754822, + "num_tokens": 166693206.0, + "step": 4365 + }, + { + "epoch": 0.5554000763261672, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8930021524429321, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8574771881103516, + "num_tokens": 166734531.0, + "step": 4366 + }, + { + "epoch": 0.5555272866047577, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.1663873195648193, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8562912940979004, + "num_tokens": 166771362.0, + "step": 4367 + }, + { + "epoch": 0.5556544968833482, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.1187095642089844, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8482975959777832, + "num_tokens": 166802784.0, + "step": 4368 + }, + { + "epoch": 0.5557817071619386, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.945369839668274, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8546664714813232, + "num_tokens": 166835590.0, + "step": 4369 + }, + { + "epoch": 0.5559089174405292, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9223945140838623, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8535283207893372, + "num_tokens": 166872145.0, + "step": 4370 + }, + { + "epoch": 0.5560361277191197, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9344022274017334, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8539713025093079, + "num_tokens": 166912855.0, + "step": 4371 + }, + { + "epoch": 0.5561633379977102, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9995427131652832, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8440015316009521, + "num_tokens": 166949482.0, + "step": 4372 + }, + { + "epoch": 0.5562905482763008, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8629827499389648, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8588395118713379, + "num_tokens": 166986727.0, + "step": 4373 + }, + { + "epoch": 0.5564177585548913, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8866888284683228, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.850149393081665, + "num_tokens": 167025869.0, + "step": 4374 + }, + { + "epoch": 0.5565449688334817, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9071670770645142, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8682867884635925, + "num_tokens": 167059825.0, + "step": 4375 + }, + { + "epoch": 0.5566721791120722, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8289403915405273, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8629118204116821, + "num_tokens": 167097375.0, + "step": 4376 + }, + { + "epoch": 0.5567993893906628, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8268038034439087, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8498258590698242, + "num_tokens": 167136555.0, + "step": 4377 + }, + { + "epoch": 0.5569265996692533, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.7085801362991333, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.855771541595459, + "num_tokens": 167179870.0, + "step": 4378 + }, + { + "epoch": 0.5570538099478438, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9696260690689087, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8575880527496338, + "num_tokens": 167215242.0, + "step": 4379 + }, + { + "epoch": 0.5571810202264343, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.0623738765716553, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8654535412788391, + "num_tokens": 167250690.0, + "step": 4380 + }, + { + "epoch": 0.5573082305050248, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.0200321674346924, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8460943698883057, + "num_tokens": 167283677.0, + "step": 4381 + }, + { + "epoch": 0.5574354407836153, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.901597499847412, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8575952649116516, + "num_tokens": 167321355.0, + "step": 4382 + }, + { + "epoch": 0.5575626510622058, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9235050678253174, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8505691885948181, + "num_tokens": 167363103.0, + "step": 4383 + }, + { + "epoch": 0.5576898613407963, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8545081615447998, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8711177110671997, + "num_tokens": 167399732.0, + "step": 4384 + }, + { + "epoch": 0.5578170716193869, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9516689777374268, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8581932783126831, + "num_tokens": 167434938.0, + "step": 4385 + }, + { + "epoch": 0.5579442818979774, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.879741907119751, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8609748482704163, + "num_tokens": 167472293.0, + "step": 4386 + }, + { + "epoch": 0.5580714921765678, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8877588510513306, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8624062538146973, + "num_tokens": 167511015.0, + "step": 4387 + }, + { + "epoch": 0.5581987024551583, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9655100107192993, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.85736083984375, + "num_tokens": 167549777.0, + "step": 4388 + }, + { + "epoch": 0.5583259127337489, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8356767892837524, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8639885187149048, + "num_tokens": 167592342.0, + "step": 4389 + }, + { + "epoch": 0.5584531230123394, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.798861026763916, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8623850345611572, + "num_tokens": 167630829.0, + "step": 4390 + }, + { + "epoch": 0.5585803332909299, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9209381341934204, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8467721343040466, + "num_tokens": 167675938.0, + "step": 4391 + }, + { + "epoch": 0.5587075435695205, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9483163356781006, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.862240195274353, + "num_tokens": 167716550.0, + "step": 4392 + }, + { + "epoch": 0.5588347538481109, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.0431926250457764, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8446146249771118, + "num_tokens": 167754401.0, + "step": 4393 + }, + { + "epoch": 0.5589619641267014, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.950596570968628, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8630545735359192, + "num_tokens": 167792124.0, + "step": 4394 + }, + { + "epoch": 0.5590891744052919, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.7973161935806274, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8770471215248108, + "num_tokens": 167829226.0, + "step": 4395 + }, + { + "epoch": 0.5592163846838825, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9118714332580566, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8693888187408447, + "num_tokens": 167863439.0, + "step": 4396 + }, + { + "epoch": 0.559343594962473, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9290493726730347, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8594129085540771, + "num_tokens": 167907019.0, + "step": 4397 + }, + { + "epoch": 0.5594708052410635, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8911868333816528, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8514382839202881, + "num_tokens": 167942539.0, + "step": 4398 + }, + { + "epoch": 0.5595980155196539, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9263852834701538, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8584657907485962, + "num_tokens": 167982848.0, + "step": 4399 + }, + { + "epoch": 0.5597252257982445, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9311202764511108, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8668709993362427, + "num_tokens": 168017500.0, + "step": 4400 + }, + { + "epoch": 0.559852436076835, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.023763418197632, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8440439701080322, + "num_tokens": 168052790.0, + "step": 4401 + }, + { + "epoch": 0.5599796463554255, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.085761547088623, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8652408123016357, + "num_tokens": 168082541.0, + "step": 4402 + }, + { + "epoch": 0.560106856634016, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8202133178710938, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8647047281265259, + "num_tokens": 168128218.0, + "step": 4403 + }, + { + "epoch": 0.5602340669126066, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9365828037261963, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8604052066802979, + "num_tokens": 168165496.0, + "step": 4404 + }, + { + "epoch": 0.560361277191197, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.933168649673462, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.845969557762146, + "num_tokens": 168204675.0, + "step": 4405 + }, + { + "epoch": 0.5604884874697875, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9687186479568481, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8533250093460083, + "num_tokens": 168244832.0, + "step": 4406 + }, + { + "epoch": 0.560615697748378, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8624669313430786, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8542863130569458, + "num_tokens": 168282440.0, + "step": 4407 + }, + { + "epoch": 0.5607429080269686, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8451595306396484, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8640301823616028, + "num_tokens": 168325303.0, + "step": 4408 + }, + { + "epoch": 0.5608701183055591, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.1046102046966553, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8651026487350464, + "num_tokens": 168360628.0, + "step": 4409 + }, + { + "epoch": 0.5609973285841496, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9928337335586548, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8463215827941895, + "num_tokens": 168398601.0, + "step": 4410 + }, + { + "epoch": 0.5611245388627402, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.929003119468689, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8448120355606079, + "num_tokens": 168436893.0, + "step": 4411 + }, + { + "epoch": 0.5612517491413306, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.082390308380127, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8528273105621338, + "num_tokens": 168478794.0, + "step": 4412 + }, + { + "epoch": 0.5613789594199211, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9942872524261475, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8555967807769775, + "num_tokens": 168520184.0, + "step": 4413 + }, + { + "epoch": 0.5615061696985116, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9257851839065552, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.854972779750824, + "num_tokens": 168554305.0, + "step": 4414 + }, + { + "epoch": 0.5616333799771022, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8276206254959106, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8421655893325806, + "num_tokens": 168597752.0, + "step": 4415 + }, + { + "epoch": 0.5617605902556927, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.043109655380249, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8639705181121826, + "num_tokens": 168637764.0, + "step": 4416 + }, + { + "epoch": 0.5618878005342832, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9402220249176025, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.839676022529602, + "num_tokens": 168672260.0, + "step": 4417 + }, + { + "epoch": 0.5620150108128736, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.7687441110610962, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8670110702514648, + "num_tokens": 168711478.0, + "step": 4418 + }, + { + "epoch": 0.5621422210914642, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.7782725095748901, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8611636757850647, + "num_tokens": 168751773.0, + "step": 4419 + }, + { + "epoch": 0.5622694313700547, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.336193323135376, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.869235098361969, + "num_tokens": 168786364.0, + "step": 4420 + }, + { + "epoch": 0.5623966416486452, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8951407670974731, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8557120561599731, + "num_tokens": 168826819.0, + "step": 4421 + }, + { + "epoch": 0.5625238519272358, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8219377994537354, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8567872643470764, + "num_tokens": 168869405.0, + "step": 4422 + }, + { + "epoch": 0.5626510622058263, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9267336130142212, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8649333715438843, + "num_tokens": 168904323.0, + "step": 4423 + }, + { + "epoch": 0.5627782724844167, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8627257347106934, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.848970353603363, + "num_tokens": 168948654.0, + "step": 4424 + }, + { + "epoch": 0.5629054827630072, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9192001819610596, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8652009963989258, + "num_tokens": 168984333.0, + "step": 4425 + }, + { + "epoch": 0.5630326930415978, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.920609474182129, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8584779500961304, + "num_tokens": 169019843.0, + "step": 4426 + }, + { + "epoch": 0.5631599033201883, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.7231640815734863, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8702326416969299, + "num_tokens": 169058177.0, + "step": 4427 + }, + { + "epoch": 0.5632871135987788, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9817347526550293, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8639266490936279, + "num_tokens": 169102268.0, + "step": 4428 + }, + { + "epoch": 0.5634143238773693, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.963091492652893, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8526735305786133, + "num_tokens": 169141709.0, + "step": 4429 + }, + { + "epoch": 0.5635415341559598, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.0345025062561035, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8557560443878174, + "num_tokens": 169177107.0, + "step": 4430 + }, + { + "epoch": 0.5636687444345503, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9303542375564575, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8556509613990784, + "num_tokens": 169211531.0, + "step": 4431 + }, + { + "epoch": 0.5637959547131408, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.1291415691375732, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.850826621055603, + "num_tokens": 169247507.0, + "step": 4432 + }, + { + "epoch": 0.5639231649917313, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.0401182174682617, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8723864555358887, + "num_tokens": 169281007.0, + "step": 4433 + }, + { + "epoch": 0.5640503752703219, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9264146089553833, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8589015007019043, + "num_tokens": 169322295.0, + "step": 4434 + }, + { + "epoch": 0.5641775855489124, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8810276985168457, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8545976877212524, + "num_tokens": 169358184.0, + "step": 4435 + }, + { + "epoch": 0.5643047958275028, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.0804903507232666, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8638471364974976, + "num_tokens": 169391518.0, + "step": 4436 + }, + { + "epoch": 0.5644320061060933, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9412806034088135, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8594521880149841, + "num_tokens": 169426743.0, + "step": 4437 + }, + { + "epoch": 0.5645592163846839, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.665980577468872, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8775244355201721, + "num_tokens": 169466452.0, + "step": 4438 + }, + { + "epoch": 0.5646864266632744, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9950426816940308, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8630011081695557, + "num_tokens": 169500295.0, + "step": 4439 + }, + { + "epoch": 0.5648136369418649, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.073077917098999, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8585845232009888, + "num_tokens": 169541829.0, + "step": 4440 + }, + { + "epoch": 0.5649408472204555, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9893962144851685, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8434531092643738, + "num_tokens": 169581547.0, + "step": 4441 + }, + { + "epoch": 0.5650680574990459, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8340067863464355, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8618011474609375, + "num_tokens": 169622107.0, + "step": 4442 + }, + { + "epoch": 0.5651952677776364, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.0590744018554688, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8555352687835693, + "num_tokens": 169657258.0, + "step": 4443 + }, + { + "epoch": 0.5653224780562269, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.0574283599853516, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.848267674446106, + "num_tokens": 169691568.0, + "step": 4444 + }, + { + "epoch": 0.5654496883348175, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.9411119222640991, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8647013306617737, + "num_tokens": 169732429.0, + "step": 4445 + }, + { + "epoch": 0.565576898613408, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.89755380153656, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8509770631790161, + "num_tokens": 169769224.0, + "step": 4446 + }, + { + "epoch": 0.5657041088919985, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.7553880214691162, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8582800030708313, + "num_tokens": 169812830.0, + "step": 4447 + }, + { + "epoch": 0.5658313191705889, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.0127038955688477, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8559324741363525, + "num_tokens": 169843823.0, + "step": 4448 + }, + { + "epoch": 0.5659585294491795, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8390144109725952, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8609486222267151, + "num_tokens": 169882847.0, + "step": 4449 + }, + { + "epoch": 0.56608573972777, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 2.1842610836029053, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8398732542991638, + "num_tokens": 169921264.0, + "step": 4450 + }, + { + "epoch": 0.5662129500063605, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9408248662948608, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.86607426404953, + "num_tokens": 169949681.0, + "step": 4451 + }, + { + "epoch": 0.566340160284951, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8736579418182373, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8717869520187378, + "num_tokens": 169986254.0, + "step": 4452 + }, + { + "epoch": 0.5664673705635416, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.6678261756896973, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8545180559158325, + "num_tokens": 170022745.0, + "step": 4453 + }, + { + "epoch": 0.566594580842132, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8859498500823975, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8700110912322998, + "num_tokens": 170057038.0, + "step": 4454 + }, + { + "epoch": 0.5667217911207225, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.781506896018982, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.876768946647644, + "num_tokens": 170098460.0, + "step": 4455 + }, + { + "epoch": 0.566849001399313, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.928885579109192, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.84681236743927, + "num_tokens": 170137689.0, + "step": 4456 + }, + { + "epoch": 0.5669762116779036, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9148024320602417, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8586416840553284, + "num_tokens": 170176716.0, + "step": 4457 + }, + { + "epoch": 0.5671034219564941, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.0469539165496826, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8521267175674438, + "num_tokens": 170212621.0, + "step": 4458 + }, + { + "epoch": 0.5672306322350846, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8794692754745483, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8489415645599365, + "num_tokens": 170254493.0, + "step": 4459 + }, + { + "epoch": 0.5673578425136752, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.7835521697998047, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8793452978134155, + "num_tokens": 170290559.0, + "step": 4460 + }, + { + "epoch": 0.5674850527922656, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.7043415307998657, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8524020314216614, + "num_tokens": 170335079.0, + "step": 4461 + }, + { + "epoch": 0.5676122630708561, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.832692265510559, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8622854948043823, + "num_tokens": 170370665.0, + "step": 4462 + }, + { + "epoch": 0.5677394733494466, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.794616460800171, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8585590720176697, + "num_tokens": 170412585.0, + "step": 4463 + }, + { + "epoch": 0.5678666836280372, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.7839901447296143, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8647377490997314, + "num_tokens": 170450806.0, + "step": 4464 + }, + { + "epoch": 0.5679938939066277, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.110638380050659, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8582243919372559, + "num_tokens": 170483265.0, + "step": 4465 + }, + { + "epoch": 0.5681211041852182, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9623945951461792, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8753300905227661, + "num_tokens": 170520910.0, + "step": 4466 + }, + { + "epoch": 0.5682483144638086, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.309077024459839, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8417313694953918, + "num_tokens": 170556516.0, + "step": 4467 + }, + { + "epoch": 0.5683755247423992, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9971227645874023, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.857080340385437, + "num_tokens": 170595571.0, + "step": 4468 + }, + { + "epoch": 0.5685027350209897, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9399337768554688, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8778055906295776, + "num_tokens": 170634117.0, + "step": 4469 + }, + { + "epoch": 0.5686299452995802, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8335494995117188, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8530058264732361, + "num_tokens": 170677624.0, + "step": 4470 + }, + { + "epoch": 0.5687571555781707, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.845763087272644, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8446400165557861, + "num_tokens": 170716732.0, + "step": 4471 + }, + { + "epoch": 0.5688843658567613, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.7945245504379272, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8527284264564514, + "num_tokens": 170758521.0, + "step": 4472 + }, + { + "epoch": 0.5690115761353517, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.814428448677063, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8584226369857788, + "num_tokens": 170796799.0, + "step": 4473 + }, + { + "epoch": 0.5691387864139422, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.905452847480774, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8451891541481018, + "num_tokens": 170838569.0, + "step": 4474 + }, + { + "epoch": 0.5692659966925327, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.7910164594650269, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8609151840209961, + "num_tokens": 170878206.0, + "step": 4475 + }, + { + "epoch": 0.5693932069711233, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.003436326980591, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8459614515304565, + "num_tokens": 170915506.0, + "step": 4476 + }, + { + "epoch": 0.5695204172497138, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.829190731048584, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8547922372817993, + "num_tokens": 170952303.0, + "step": 4477 + }, + { + "epoch": 0.5696476275283043, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9557958841323853, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8515976667404175, + "num_tokens": 170993757.0, + "step": 4478 + }, + { + "epoch": 0.5697748378068948, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9663496017456055, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8557208776473999, + "num_tokens": 171030240.0, + "step": 4479 + }, + { + "epoch": 0.5699020480854853, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 3.170253276824951, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.834911584854126, + "num_tokens": 171069654.0, + "step": 4480 + }, + { + "epoch": 0.5700292583640758, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.0160231590270996, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8421788811683655, + "num_tokens": 171107341.0, + "step": 4481 + }, + { + "epoch": 0.5701564686426663, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9990559816360474, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.84791100025177, + "num_tokens": 171144063.0, + "step": 4482 + }, + { + "epoch": 0.5702836789212569, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.100822687149048, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8515117764472961, + "num_tokens": 171180448.0, + "step": 4483 + }, + { + "epoch": 0.5704108891998474, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9941672086715698, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8618624806404114, + "num_tokens": 171214220.0, + "step": 4484 + }, + { + "epoch": 0.5705380994784378, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8715137243270874, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8645976781845093, + "num_tokens": 171252798.0, + "step": 4485 + }, + { + "epoch": 0.5706653097570283, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9402952194213867, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8628825545310974, + "num_tokens": 171295434.0, + "step": 4486 + }, + { + "epoch": 0.5707925200356189, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8176075220108032, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8599390983581543, + "num_tokens": 171334129.0, + "step": 4487 + }, + { + "epoch": 0.5709197303142094, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8742856979370117, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8542478084564209, + "num_tokens": 171370239.0, + "step": 4488 + }, + { + "epoch": 0.5710469405927999, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9600048065185547, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8729904890060425, + "num_tokens": 171402411.0, + "step": 4489 + }, + { + "epoch": 0.5711741508713905, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8065555095672607, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8683122396469116, + "num_tokens": 171438736.0, + "step": 4490 + }, + { + "epoch": 0.5713013611499809, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9317591190338135, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8630654811859131, + "num_tokens": 171477949.0, + "step": 4491 + }, + { + "epoch": 0.5714285714285714, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9215530157089233, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8534568548202515, + "num_tokens": 171520953.0, + "step": 4492 + }, + { + "epoch": 0.5715557817071619, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.7970443964004517, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8647081255912781, + "num_tokens": 171559415.0, + "step": 4493 + }, + { + "epoch": 0.5716829919857525, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8589394092559814, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8663668036460876, + "num_tokens": 171593118.0, + "step": 4494 + }, + { + "epoch": 0.571810202264343, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.0671629905700684, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8453778028488159, + "num_tokens": 171634514.0, + "step": 4495 + }, + { + "epoch": 0.5719374125429335, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.983624815940857, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8612244129180908, + "num_tokens": 171675239.0, + "step": 4496 + }, + { + "epoch": 0.5720646228215239, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.99587082862854, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8660533428192139, + "num_tokens": 171713154.0, + "step": 4497 + }, + { + "epoch": 0.5721918331001145, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.0210671424865723, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.850689709186554, + "num_tokens": 171750046.0, + "step": 4498 + }, + { + "epoch": 0.572319043378705, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9764701128005981, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8477627038955688, + "num_tokens": 171782708.0, + "step": 4499 + }, + { + "epoch": 0.5724462536572955, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8503923416137695, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8545519709587097, + "num_tokens": 171819421.0, + "step": 4500 + }, + { + "epoch": 0.572573463935886, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.8383748531341553, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8569864630699158, + "num_tokens": 171860993.0, + "step": 4501 + }, + { + "epoch": 0.5727006742144766, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9229909181594849, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8582988977432251, + "num_tokens": 171896894.0, + "step": 4502 + }, + { + "epoch": 0.572827884493067, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9244545698165894, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8560643196105957, + "num_tokens": 171938694.0, + "step": 4503 + }, + { + "epoch": 0.5729550947716575, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.036003828048706, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8827464580535889, + "num_tokens": 171972606.0, + "step": 4504 + }, + { + "epoch": 0.573082305050248, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9355665445327759, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.860170841217041, + "num_tokens": 172010861.0, + "step": 4505 + }, + { + "epoch": 0.5732095153288386, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.9316974878311157, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8436648845672607, + "num_tokens": 172049867.0, + "step": 4506 + }, + { + "epoch": 0.5733367256074291, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.778851866722107, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.85845947265625, + "num_tokens": 172092176.0, + "step": 4507 + }, + { + "epoch": 0.5734639358860196, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.0113589763641357, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8579465746879578, + "num_tokens": 172132789.0, + "step": 4508 + }, + { + "epoch": 0.5735911461646102, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 1.7948980331420898, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8718490600585938, + "num_tokens": 172170536.0, + "step": 4509 + }, + { + "epoch": 0.5737183564432006, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.996239423751831, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8577877283096313, + "num_tokens": 172206828.0, + "step": 4510 + }, + { + "epoch": 0.5738455667217911, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 1.8804516792297363, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8497579097747803, + "num_tokens": 172242311.0, + "step": 4511 + }, + { + "epoch": 0.5739727770003816, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 2.04660964012146, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8546347618103027, + "num_tokens": 172275135.0, + "step": 4512 + }, + { + "epoch": 0.5740999872789722, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 1.9080960750579834, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8669725656509399, + "num_tokens": 172308682.0, + "step": 4513 + }, + { + "epoch": 0.5742271975575627, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 1.8516017198562622, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8539561629295349, + "num_tokens": 172345834.0, + "step": 4514 + }, + { + "epoch": 0.5743544078361532, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 2.0086352825164795, + "learning_rate": 1e-06, + "loss": 0.5341, + "mean_token_accuracy": 0.8354191780090332, + "num_tokens": 172387976.0, + "step": 4515 + }, + { + "epoch": 0.5744816181147436, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 1.761318325996399, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8590839505195618, + "num_tokens": 172427447.0, + "step": 4516 + }, + { + "epoch": 0.5746088283933342, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 1.8363537788391113, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8500943183898926, + "num_tokens": 172468686.0, + "step": 4517 + }, + { + "epoch": 0.5747360386719247, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 2.6232709884643555, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8538936376571655, + "num_tokens": 172508190.0, + "step": 4518 + }, + { + "epoch": 0.5748632489505152, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.6749306917190552, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8724590539932251, + "num_tokens": 172552295.0, + "step": 4519 + }, + { + "epoch": 0.5749904592291057, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 1.7795501947402954, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8633712530136108, + "num_tokens": 172591065.0, + "step": 4520 + }, + { + "epoch": 0.5751176695076963, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 1.6772769689559937, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8612151145935059, + "num_tokens": 172633741.0, + "step": 4521 + }, + { + "epoch": 0.5752448797862867, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 1.8413407802581787, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8670746088027954, + "num_tokens": 172669081.0, + "step": 4522 + }, + { + "epoch": 0.5753720900648772, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 2.0456151962280273, + "learning_rate": 1e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.8359426259994507, + "num_tokens": 172704956.0, + "step": 4523 + }, + { + "epoch": 0.5754993003434677, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 1.8842394351959229, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8615900278091431, + "num_tokens": 172742270.0, + "step": 4524 + }, + { + "epoch": 0.5756265106220583, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 1.9072849750518799, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8459669351577759, + "num_tokens": 172778065.0, + "step": 4525 + }, + { + "epoch": 0.5757537209006488, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 1.7442127466201782, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8675366044044495, + "num_tokens": 172815825.0, + "step": 4526 + }, + { + "epoch": 0.5758809311792393, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 2.5433881282806396, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8582038879394531, + "num_tokens": 172849560.0, + "step": 4527 + }, + { + "epoch": 0.5760081414578297, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.154003858566284, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8585920333862305, + "num_tokens": 172885203.0, + "step": 4528 + }, + { + "epoch": 0.5761353517364203, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9773712158203125, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8574696779251099, + "num_tokens": 172922341.0, + "step": 4529 + }, + { + "epoch": 0.5762625620150108, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.037205934524536, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8642143607139587, + "num_tokens": 172953363.0, + "step": 4530 + }, + { + "epoch": 0.5763897722936013, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.03778076171875, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8639932870864868, + "num_tokens": 172986253.0, + "step": 4531 + }, + { + "epoch": 0.5765169825721919, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 1.832853078842163, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.869880199432373, + "num_tokens": 173023466.0, + "step": 4532 + }, + { + "epoch": 0.5766441928507824, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9020143747329712, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8520382642745972, + "num_tokens": 173063329.0, + "step": 4533 + }, + { + "epoch": 0.5767714031293728, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.845064401626587, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8643683195114136, + "num_tokens": 173100676.0, + "step": 4534 + }, + { + "epoch": 0.5768986134079633, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.681397557258606, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8694978356361389, + "num_tokens": 173135088.0, + "step": 4535 + }, + { + "epoch": 0.5770258236865539, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9100079536437988, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8498552441596985, + "num_tokens": 173175810.0, + "step": 4536 + }, + { + "epoch": 0.5771530339651444, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.7142068147659302, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8595609664916992, + "num_tokens": 173220826.0, + "step": 4537 + }, + { + "epoch": 0.5772802442437349, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.895172119140625, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8687683939933777, + "num_tokens": 173258411.0, + "step": 4538 + }, + { + "epoch": 0.5774074545223254, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8305261135101318, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8421457409858704, + "num_tokens": 173297298.0, + "step": 4539 + }, + { + "epoch": 0.5775346648009159, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.048638343811035, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8411898612976074, + "num_tokens": 173330839.0, + "step": 4540 + }, + { + "epoch": 0.5776618750795064, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.860309362411499, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8505237102508545, + "num_tokens": 173370219.0, + "step": 4541 + }, + { + "epoch": 0.5777890853580969, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8341715335845947, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8589602708816528, + "num_tokens": 173410425.0, + "step": 4542 + }, + { + "epoch": 0.5779162956366874, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9183728694915771, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8587384223937988, + "num_tokens": 173447109.0, + "step": 4543 + }, + { + "epoch": 0.578043505915278, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9315454959869385, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8615555763244629, + "num_tokens": 173490064.0, + "step": 4544 + }, + { + "epoch": 0.5781707161938685, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.022191286087036, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8618594408035278, + "num_tokens": 173523791.0, + "step": 4545 + }, + { + "epoch": 0.5782979264724589, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8834576606750488, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8669593334197998, + "num_tokens": 173564069.0, + "step": 4546 + }, + { + "epoch": 0.5784251367510495, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.110692262649536, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8577736616134644, + "num_tokens": 173601893.0, + "step": 4547 + }, + { + "epoch": 0.57855234702964, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8536266088485718, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8682029843330383, + "num_tokens": 173642299.0, + "step": 4548 + }, + { + "epoch": 0.5786795573082305, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9934628009796143, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8604271411895752, + "num_tokens": 173675824.0, + "step": 4549 + }, + { + "epoch": 0.578806767586821, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9505785703659058, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8599289655685425, + "num_tokens": 173712192.0, + "step": 4550 + }, + { + "epoch": 0.5789339778654116, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8373997211456299, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8655608296394348, + "num_tokens": 173746949.0, + "step": 4551 + }, + { + "epoch": 0.579061188144002, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8245503902435303, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.860121488571167, + "num_tokens": 173784527.0, + "step": 4552 + }, + { + "epoch": 0.5791883984225925, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8584766387939453, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8648210763931274, + "num_tokens": 173820648.0, + "step": 4553 + }, + { + "epoch": 0.579315608701183, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.7490371465682983, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8528109788894653, + "num_tokens": 173860427.0, + "step": 4554 + }, + { + "epoch": 0.5794428189797736, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9974185228347778, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.861433744430542, + "num_tokens": 173899876.0, + "step": 4555 + }, + { + "epoch": 0.5795700292583641, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.806297779083252, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8676295280456543, + "num_tokens": 173937354.0, + "step": 4556 + }, + { + "epoch": 0.5796972395369546, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9294689893722534, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8475120067596436, + "num_tokens": 173976039.0, + "step": 4557 + }, + { + "epoch": 0.5798244498155452, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.147597551345825, + "learning_rate": 1e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8378413915634155, + "num_tokens": 174006359.0, + "step": 4558 + }, + { + "epoch": 0.5799516600941356, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9422285556793213, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.85011225938797, + "num_tokens": 174044662.0, + "step": 4559 + }, + { + "epoch": 0.5800788703727261, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.0429108142852783, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8543370962142944, + "num_tokens": 174077133.0, + "step": 4560 + }, + { + "epoch": 0.5802060806513166, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8267534971237183, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8540184497833252, + "num_tokens": 174121873.0, + "step": 4561 + }, + { + "epoch": 0.5803332909299072, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8800246715545654, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8592448234558105, + "num_tokens": 174158873.0, + "step": 4562 + }, + { + "epoch": 0.5804605012084977, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.16585373878479, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8656368255615234, + "num_tokens": 174189222.0, + "step": 4563 + }, + { + "epoch": 0.5805877114870882, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.0480549335479736, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.850483775138855, + "num_tokens": 174229018.0, + "step": 4564 + }, + { + "epoch": 0.5807149217656786, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8677375316619873, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8599233627319336, + "num_tokens": 174269215.0, + "step": 4565 + }, + { + "epoch": 0.5808421320442692, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0043933391571045, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8672215938568115, + "num_tokens": 174303926.0, + "step": 4566 + }, + { + "epoch": 0.5809693423228597, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8460155725479126, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8601956367492676, + "num_tokens": 174347491.0, + "step": 4567 + }, + { + "epoch": 0.5810965526014502, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.853829026222229, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8598920702934265, + "num_tokens": 174384246.0, + "step": 4568 + }, + { + "epoch": 0.5812237628800407, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9759880304336548, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8581250905990601, + "num_tokens": 174417081.0, + "step": 4569 + }, + { + "epoch": 0.5813509731586313, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8270138502120972, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8473424911499023, + "num_tokens": 174457364.0, + "step": 4570 + }, + { + "epoch": 0.5814781834372217, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8517969846725464, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8595923185348511, + "num_tokens": 174494304.0, + "step": 4571 + }, + { + "epoch": 0.5816053937158122, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.022078275680542, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.859290361404419, + "num_tokens": 174528047.0, + "step": 4572 + }, + { + "epoch": 0.5817326039944027, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 1.8489794731140137, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8641014695167542, + "num_tokens": 174565990.0, + "step": 4573 + }, + { + "epoch": 0.5818598142729933, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9162241220474243, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8439226746559143, + "num_tokens": 174599039.0, + "step": 4574 + }, + { + "epoch": 0.5819870245515838, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8773113489151, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8524996638298035, + "num_tokens": 174634786.0, + "step": 4575 + }, + { + "epoch": 0.5821142348301743, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9491828680038452, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.849602460861206, + "num_tokens": 174677875.0, + "step": 4576 + }, + { + "epoch": 0.5822414451087647, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8137668371200562, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8520710468292236, + "num_tokens": 174715421.0, + "step": 4577 + }, + { + "epoch": 0.5823686553873553, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.086979627609253, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8547139167785645, + "num_tokens": 174746555.0, + "step": 4578 + }, + { + "epoch": 0.5824958656659458, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8701989650726318, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8469209671020508, + "num_tokens": 174786470.0, + "step": 4579 + }, + { + "epoch": 0.5826230759445363, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8285157680511475, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8560057282447815, + "num_tokens": 174827131.0, + "step": 4580 + }, + { + "epoch": 0.5827502862231269, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8897675275802612, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8504117727279663, + "num_tokens": 174861391.0, + "step": 4581 + }, + { + "epoch": 0.5828774965017174, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8849598169326782, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8630272150039673, + "num_tokens": 174897437.0, + "step": 4582 + }, + { + "epoch": 0.5830047067803078, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.936524748802185, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8486464023590088, + "num_tokens": 174932504.0, + "step": 4583 + }, + { + "epoch": 0.5831319170588983, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8616899251937866, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8527988195419312, + "num_tokens": 174974746.0, + "step": 4584 + }, + { + "epoch": 0.5832591273374889, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.945924162864685, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8522641658782959, + "num_tokens": 175016155.0, + "step": 4585 + }, + { + "epoch": 0.5833863376160794, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9523028135299683, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8435947299003601, + "num_tokens": 175060595.0, + "step": 4586 + }, + { + "epoch": 0.5835135478946699, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8061482906341553, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8571991324424744, + "num_tokens": 175101278.0, + "step": 4587 + }, + { + "epoch": 0.5836407581732604, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9500744342803955, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8509283065795898, + "num_tokens": 175139479.0, + "step": 4588 + }, + { + "epoch": 0.5837679684518509, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8335802555084229, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8409953117370605, + "num_tokens": 175182376.0, + "step": 4589 + }, + { + "epoch": 0.5838951787304414, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8952692747116089, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8506871461868286, + "num_tokens": 175225449.0, + "step": 4590 + }, + { + "epoch": 0.5840223890090319, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.075181722640991, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8558332324028015, + "num_tokens": 175255959.0, + "step": 4591 + }, + { + "epoch": 0.5841495992876224, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9308936595916748, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8452208638191223, + "num_tokens": 175292142.0, + "step": 4592 + }, + { + "epoch": 0.584276809566213, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9016584157943726, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8504396677017212, + "num_tokens": 175336630.0, + "step": 4593 + }, + { + "epoch": 0.5844040198448035, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.012033462524414, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8696938753128052, + "num_tokens": 175372697.0, + "step": 4594 + }, + { + "epoch": 0.5845312301233939, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0121612548828125, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8459169864654541, + "num_tokens": 175412395.0, + "step": 4595 + }, + { + "epoch": 0.5846584404019844, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.1224234104156494, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8570410013198853, + "num_tokens": 175447295.0, + "step": 4596 + }, + { + "epoch": 0.584785650680575, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9887850284576416, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8659089803695679, + "num_tokens": 175480229.0, + "step": 4597 + }, + { + "epoch": 0.5849128609591655, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8490580320358276, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8569115996360779, + "num_tokens": 175523081.0, + "step": 4598 + }, + { + "epoch": 0.585040071237756, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8221783638000488, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8607717752456665, + "num_tokens": 175565036.0, + "step": 4599 + }, + { + "epoch": 0.5851672815163466, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9010066986083984, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8572918176651001, + "num_tokens": 175606733.0, + "step": 4600 + }, + { + "epoch": 0.585294491794937, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0389621257781982, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8720913529396057, + "num_tokens": 175637663.0, + "step": 4601 + }, + { + "epoch": 0.5854217020735275, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.1880767345428467, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8594317436218262, + "num_tokens": 175677381.0, + "step": 4602 + }, + { + "epoch": 0.585548912352118, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.1105387210845947, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.868026614189148, + "num_tokens": 175713049.0, + "step": 4603 + }, + { + "epoch": 0.5856761226307086, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8946537971496582, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8480930924415588, + "num_tokens": 175753681.0, + "step": 4604 + }, + { + "epoch": 0.5858033329092991, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 1.9748769998550415, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8449219465255737, + "num_tokens": 175789725.0, + "step": 4605 + }, + { + "epoch": 0.5859305431878896, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9877489805221558, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8563251495361328, + "num_tokens": 175823203.0, + "step": 4606 + }, + { + "epoch": 0.5860577534664801, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9193062782287598, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8526540398597717, + "num_tokens": 175860506.0, + "step": 4607 + }, + { + "epoch": 0.5861849637450706, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9192066192626953, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8740195035934448, + "num_tokens": 175895080.0, + "step": 4608 + }, + { + "epoch": 0.5863121740236611, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7801693677902222, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.851388692855835, + "num_tokens": 175937113.0, + "step": 4609 + }, + { + "epoch": 0.5864393843022516, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 1.96587073802948, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8689083456993103, + "num_tokens": 175966193.0, + "step": 4610 + }, + { + "epoch": 0.5865665945808421, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9227428436279297, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.850886344909668, + "num_tokens": 176003771.0, + "step": 4611 + }, + { + "epoch": 0.5866938048594327, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8267219066619873, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8635138273239136, + "num_tokens": 176050069.0, + "step": 4612 + }, + { + "epoch": 0.5868210151380232, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.891985535621643, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8722594976425171, + "num_tokens": 176088763.0, + "step": 4613 + }, + { + "epoch": 0.5869482254166136, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.673349142074585, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8588775992393494, + "num_tokens": 176127853.0, + "step": 4614 + }, + { + "epoch": 0.5870754356952042, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8869638442993164, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8435499668121338, + "num_tokens": 176173775.0, + "step": 4615 + }, + { + "epoch": 0.5872026459737947, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7571232318878174, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8619204759597778, + "num_tokens": 176210878.0, + "step": 4616 + }, + { + "epoch": 0.5873298562523852, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 1.9121464490890503, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8519460558891296, + "num_tokens": 176255094.0, + "step": 4617 + }, + { + "epoch": 0.5874570665309757, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9261209964752197, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8543359041213989, + "num_tokens": 176293586.0, + "step": 4618 + }, + { + "epoch": 0.5875842768095663, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.990188717842102, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8594446182250977, + "num_tokens": 176329203.0, + "step": 4619 + }, + { + "epoch": 0.5877114870881567, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.7560436725616455, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8745435476303101, + "num_tokens": 176367984.0, + "step": 4620 + }, + { + "epoch": 0.5878386973667472, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.7871085405349731, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8707041144371033, + "num_tokens": 176408949.0, + "step": 4621 + }, + { + "epoch": 0.5879659076453377, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.7085695266723633, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8731842637062073, + "num_tokens": 176451073.0, + "step": 4622 + }, + { + "epoch": 0.5880931179239283, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 2.1662437915802, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8640533089637756, + "num_tokens": 176490623.0, + "step": 4623 + }, + { + "epoch": 0.5882203282025188, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.0087506771087646, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.846138596534729, + "num_tokens": 176525771.0, + "step": 4624 + }, + { + "epoch": 0.5883475384811093, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8295254707336426, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8766226768493652, + "num_tokens": 176565417.0, + "step": 4625 + }, + { + "epoch": 0.5884747487596997, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.0264699459075928, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8550777435302734, + "num_tokens": 176599005.0, + "step": 4626 + }, + { + "epoch": 0.5886019590382903, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.7840807437896729, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8604283332824707, + "num_tokens": 176637947.0, + "step": 4627 + }, + { + "epoch": 0.5887291693168808, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 1.9604229927062988, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8476558923721313, + "num_tokens": 176675241.0, + "step": 4628 + }, + { + "epoch": 0.5888563795954713, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.0654921531677246, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8513362407684326, + "num_tokens": 176715375.0, + "step": 4629 + }, + { + "epoch": 0.5889835898740619, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9989254474639893, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8560656905174255, + "num_tokens": 176750977.0, + "step": 4630 + }, + { + "epoch": 0.5891108001526524, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 1.9688843488693237, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8618738055229187, + "num_tokens": 176788968.0, + "step": 4631 + }, + { + "epoch": 0.5892380104312428, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.986634612083435, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8536250591278076, + "num_tokens": 176824399.0, + "step": 4632 + }, + { + "epoch": 0.5893652207098333, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9433358907699585, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8537317514419556, + "num_tokens": 176862053.0, + "step": 4633 + }, + { + "epoch": 0.5894924309884239, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.0153706073760986, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8720303177833557, + "num_tokens": 176896943.0, + "step": 4634 + }, + { + "epoch": 0.5896196412670144, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8296692371368408, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.859359622001648, + "num_tokens": 176936263.0, + "step": 4635 + }, + { + "epoch": 0.5897468515456049, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.04736328125, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8464892506599426, + "num_tokens": 176976632.0, + "step": 4636 + }, + { + "epoch": 0.5898740618241954, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.0093276500701904, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8636935949325562, + "num_tokens": 177007036.0, + "step": 4637 + }, + { + "epoch": 0.5900012721027859, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9239522218704224, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8490127325057983, + "num_tokens": 177042189.0, + "step": 4638 + }, + { + "epoch": 0.5901284823813764, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.0876564979553223, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8534010648727417, + "num_tokens": 177082428.0, + "step": 4639 + }, + { + "epoch": 0.5902556926599669, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9201991558074951, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8596034049987793, + "num_tokens": 177124447.0, + "step": 4640 + }, + { + "epoch": 0.5903829029385574, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8274140357971191, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8733417391777039, + "num_tokens": 177160516.0, + "step": 4641 + }, + { + "epoch": 0.590510113217148, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.816623568534851, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.866960883140564, + "num_tokens": 177198100.0, + "step": 4642 + }, + { + "epoch": 0.5906373234957385, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.778436303138733, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8512556552886963, + "num_tokens": 177243170.0, + "step": 4643 + }, + { + "epoch": 0.5907645337743289, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8317118883132935, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8566247820854187, + "num_tokens": 177282976.0, + "step": 4644 + }, + { + "epoch": 0.5908917440529194, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8912876844406128, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8414713144302368, + "num_tokens": 177323725.0, + "step": 4645 + }, + { + "epoch": 0.59101895433151, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.0862350463867188, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8578263521194458, + "num_tokens": 177352178.0, + "step": 4646 + }, + { + "epoch": 0.5911461646101005, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 2.102283000946045, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8553061485290527, + "num_tokens": 177392322.0, + "step": 4647 + }, + { + "epoch": 0.591273374888691, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8619426488876343, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8554177284240723, + "num_tokens": 177432013.0, + "step": 4648 + }, + { + "epoch": 0.5914005851672816, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8340959548950195, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8709279298782349, + "num_tokens": 177473542.0, + "step": 4649 + }, + { + "epoch": 0.591527795445872, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8327605724334717, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.854854941368103, + "num_tokens": 177516590.0, + "step": 4650 + }, + { + "epoch": 0.5916550057244625, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.8281828165054321, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8713642358779907, + "num_tokens": 177553750.0, + "step": 4651 + }, + { + "epoch": 0.591782216003053, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 2.045128345489502, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.852143406867981, + "num_tokens": 177586093.0, + "step": 4652 + }, + { + "epoch": 0.5919094262816436, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.772139072418213, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8546594977378845, + "num_tokens": 177626800.0, + "step": 4653 + }, + { + "epoch": 0.5920366365602341, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 1.8481616973876953, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.856574296951294, + "num_tokens": 177662336.0, + "step": 4654 + }, + { + "epoch": 0.5921638468388246, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.7558770179748535, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.865221381187439, + "num_tokens": 177706922.0, + "step": 4655 + }, + { + "epoch": 0.592291057117415, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9499950408935547, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8558483719825745, + "num_tokens": 177743922.0, + "step": 4656 + }, + { + "epoch": 0.5924182673960056, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.9475456476211548, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8668357133865356, + "num_tokens": 177777069.0, + "step": 4657 + }, + { + "epoch": 0.5925454776745961, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.1980113983154297, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8663145899772644, + "num_tokens": 177810959.0, + "step": 4658 + }, + { + "epoch": 0.5926726879531866, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9938254356384277, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8473635911941528, + "num_tokens": 177846366.0, + "step": 4659 + }, + { + "epoch": 0.5927998982317771, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.832946538925171, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8705483078956604, + "num_tokens": 177881450.0, + "step": 4660 + }, + { + "epoch": 0.5929271085103677, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7108697891235352, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8585439920425415, + "num_tokens": 177922931.0, + "step": 4661 + }, + { + "epoch": 0.5930543187889582, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 1.9193923473358154, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8402057886123657, + "num_tokens": 177962722.0, + "step": 4662 + }, + { + "epoch": 0.5931815290675486, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.025473117828369, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8570215702056885, + "num_tokens": 177999956.0, + "step": 4663 + }, + { + "epoch": 0.5933087393461391, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9571266174316406, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8485368490219116, + "num_tokens": 178032450.0, + "step": 4664 + }, + { + "epoch": 0.5934359496247297, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.774316668510437, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8583242893218994, + "num_tokens": 178075028.0, + "step": 4665 + }, + { + "epoch": 0.5935631599033202, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0875792503356934, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8534648418426514, + "num_tokens": 178108267.0, + "step": 4666 + }, + { + "epoch": 0.5936903701819107, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8183163404464722, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8579953908920288, + "num_tokens": 178148091.0, + "step": 4667 + }, + { + "epoch": 0.5938175804605013, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9954092502593994, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8705376386642456, + "num_tokens": 178185626.0, + "step": 4668 + }, + { + "epoch": 0.5939447907390917, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9015382528305054, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8660107851028442, + "num_tokens": 178224165.0, + "step": 4669 + }, + { + "epoch": 0.5940720010176822, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8940547704696655, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8575706481933594, + "num_tokens": 178264118.0, + "step": 4670 + }, + { + "epoch": 0.5941992112962727, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.1249265670776367, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8691174983978271, + "num_tokens": 178295634.0, + "step": 4671 + }, + { + "epoch": 0.5943264215748633, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.1316847801208496, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8653489351272583, + "num_tokens": 178327076.0, + "step": 4672 + }, + { + "epoch": 0.5944536318534538, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.03757905960083, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8501577377319336, + "num_tokens": 178357469.0, + "step": 4673 + }, + { + "epoch": 0.5945808421320443, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.865445613861084, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8645882606506348, + "num_tokens": 178393830.0, + "step": 4674 + }, + { + "epoch": 0.5947080524106347, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.003645420074463, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8574646711349487, + "num_tokens": 178427560.0, + "step": 4675 + }, + { + "epoch": 0.5948352626892253, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9292389154434204, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8582250475883484, + "num_tokens": 178464666.0, + "step": 4676 + }, + { + "epoch": 0.5949624729678158, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8691294193267822, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8486050367355347, + "num_tokens": 178508771.0, + "step": 4677 + }, + { + "epoch": 0.5950896832464063, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8364099264144897, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8620352745056152, + "num_tokens": 178548516.0, + "step": 4678 + }, + { + "epoch": 0.5952168935249968, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0129997730255127, + "learning_rate": 1e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.8230117559432983, + "num_tokens": 178591276.0, + "step": 4679 + }, + { + "epoch": 0.5953441038035874, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8178553581237793, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8573859930038452, + "num_tokens": 178632580.0, + "step": 4680 + }, + { + "epoch": 0.5954713140821778, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.96535062789917, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8538748025894165, + "num_tokens": 178672858.0, + "step": 4681 + }, + { + "epoch": 0.5955985243607683, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9117176532745361, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8611003160476685, + "num_tokens": 178706435.0, + "step": 4682 + }, + { + "epoch": 0.5957257346393589, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.916025161743164, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8550965189933777, + "num_tokens": 178746309.0, + "step": 4683 + }, + { + "epoch": 0.5958529449179494, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.846226453781128, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8395355343818665, + "num_tokens": 178787639.0, + "step": 4684 + }, + { + "epoch": 0.5959801551965399, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8496589660644531, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8637163639068604, + "num_tokens": 178828271.0, + "step": 4685 + }, + { + "epoch": 0.5961073654751304, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9495792388916016, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8791568279266357, + "num_tokens": 178862862.0, + "step": 4686 + }, + { + "epoch": 0.5962345757537209, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.806044340133667, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8697713613510132, + "num_tokens": 178897969.0, + "step": 4687 + }, + { + "epoch": 0.5963617860323114, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7279592752456665, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8616700172424316, + "num_tokens": 178942002.0, + "step": 4688 + }, + { + "epoch": 0.5964889963109019, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7530499696731567, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8675594925880432, + "num_tokens": 178983473.0, + "step": 4689 + }, + { + "epoch": 0.5966162065894924, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9859709739685059, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8469651341438293, + "num_tokens": 179022370.0, + "step": 4690 + }, + { + "epoch": 0.596743416868083, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.6930968761444092, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8727060556411743, + "num_tokens": 179064841.0, + "step": 4691 + }, + { + "epoch": 0.5968706271466735, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9382158517837524, + "learning_rate": 1e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.8306349515914917, + "num_tokens": 179099399.0, + "step": 4692 + }, + { + "epoch": 0.5969978374252639, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.043867588043213, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8508737087249756, + "num_tokens": 179133969.0, + "step": 4693 + }, + { + "epoch": 0.5971250477038544, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9276959896087646, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8556821346282959, + "num_tokens": 179165456.0, + "step": 4694 + }, + { + "epoch": 0.597252257982445, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0536458492279053, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8623756170272827, + "num_tokens": 179198209.0, + "step": 4695 + }, + { + "epoch": 0.5973794682610355, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9822567701339722, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8430283069610596, + "num_tokens": 179233872.0, + "step": 4696 + }, + { + "epoch": 0.597506678539626, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7962485551834106, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8793182373046875, + "num_tokens": 179272721.0, + "step": 4697 + }, + { + "epoch": 0.5976338888182166, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.032561779022217, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8571065068244934, + "num_tokens": 179305781.0, + "step": 4698 + }, + { + "epoch": 0.597761099096807, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.073669672012329, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8625516891479492, + "num_tokens": 179350049.0, + "step": 4699 + }, + { + "epoch": 0.5978883093753975, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.902197003364563, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8581504821777344, + "num_tokens": 179388063.0, + "step": 4700 + }, + { + "epoch": 0.598015519653988, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8103677034378052, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8654656410217285, + "num_tokens": 179427607.0, + "step": 4701 + }, + { + "epoch": 0.5981427299325786, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9359655380249023, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8412887454032898, + "num_tokens": 179471456.0, + "step": 4702 + }, + { + "epoch": 0.5982699402111691, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.797029733657837, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8694912791252136, + "num_tokens": 179514318.0, + "step": 4703 + }, + { + "epoch": 0.5983971504897596, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7103519439697266, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8715925216674805, + "num_tokens": 179553307.0, + "step": 4704 + }, + { + "epoch": 0.59852436076835, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7379872798919678, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8503014445304871, + "num_tokens": 179597288.0, + "step": 4705 + }, + { + "epoch": 0.5986515710469406, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.188397169113159, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8582112193107605, + "num_tokens": 179629583.0, + "step": 4706 + }, + { + "epoch": 0.5987787813255311, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9716469049453735, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8522485494613647, + "num_tokens": 179672104.0, + "step": 4707 + }, + { + "epoch": 0.5989059916041216, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8571876287460327, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8499897122383118, + "num_tokens": 179712951.0, + "step": 4708 + }, + { + "epoch": 0.5990332018827121, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.752612590789795, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8575522899627686, + "num_tokens": 179754626.0, + "step": 4709 + }, + { + "epoch": 0.5991604121613027, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9721325635910034, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8635768294334412, + "num_tokens": 179790068.0, + "step": 4710 + }, + { + "epoch": 0.5992876224398932, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.109210252761841, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8384513854980469, + "num_tokens": 179823936.0, + "step": 4711 + }, + { + "epoch": 0.5994148327184836, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9815312623977661, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8602925539016724, + "num_tokens": 179860920.0, + "step": 4712 + }, + { + "epoch": 0.5995420429970741, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9611284732818604, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8488544821739197, + "num_tokens": 179899848.0, + "step": 4713 + }, + { + "epoch": 0.5996692532756647, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9407626390457153, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8756073713302612, + "num_tokens": 179934462.0, + "step": 4714 + }, + { + "epoch": 0.5997964635542552, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.902085542678833, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8573427200317383, + "num_tokens": 179973955.0, + "step": 4715 + }, + { + "epoch": 0.5999236738328457, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.83670973777771, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8596746325492859, + "num_tokens": 180015256.0, + "step": 4716 + }, + { + "epoch": 0.6000508841114363, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.917302131652832, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8666322231292725, + "num_tokens": 180054145.0, + "step": 4717 + }, + { + "epoch": 0.6001780943900267, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8117172718048096, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8630115985870361, + "num_tokens": 180093411.0, + "step": 4718 + }, + { + "epoch": 0.6003053046686172, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.802726149559021, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8686810731887817, + "num_tokens": 180135799.0, + "step": 4719 + }, + { + "epoch": 0.6004325149472077, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9179604053497314, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8530833721160889, + "num_tokens": 180172623.0, + "step": 4720 + }, + { + "epoch": 0.6005597252257983, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7552731037139893, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8546444773674011, + "num_tokens": 180213833.0, + "step": 4721 + }, + { + "epoch": 0.6006869355043888, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7763137817382812, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.865839958190918, + "num_tokens": 180254982.0, + "step": 4722 + }, + { + "epoch": 0.6008141457829793, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.924359917640686, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8675562143325806, + "num_tokens": 180288544.0, + "step": 4723 + }, + { + "epoch": 0.6009413560615697, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8358081579208374, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8546844720840454, + "num_tokens": 180330233.0, + "step": 4724 + }, + { + "epoch": 0.6010685663401603, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9736225605010986, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.860731840133667, + "num_tokens": 180361516.0, + "step": 4725 + }, + { + "epoch": 0.6011957766187508, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.922741174697876, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8568840026855469, + "num_tokens": 180404380.0, + "step": 4726 + }, + { + "epoch": 0.6013229868973413, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.044219732284546, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8515656590461731, + "num_tokens": 180436916.0, + "step": 4727 + }, + { + "epoch": 0.6014501971759318, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8315105438232422, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.841016411781311, + "num_tokens": 180476895.0, + "step": 4728 + }, + { + "epoch": 0.6015774074545224, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.93975830078125, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8583385348320007, + "num_tokens": 180515248.0, + "step": 4729 + }, + { + "epoch": 0.6017046177331128, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8526413440704346, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8655644059181213, + "num_tokens": 180554575.0, + "step": 4730 + }, + { + "epoch": 0.6018318280117033, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8494304418563843, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.849114179611206, + "num_tokens": 180594485.0, + "step": 4731 + }, + { + "epoch": 0.6019590382902938, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8594040870666504, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8439648151397705, + "num_tokens": 180634746.0, + "step": 4732 + }, + { + "epoch": 0.6020862485688844, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9100972414016724, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8489356637001038, + "num_tokens": 180674946.0, + "step": 4733 + }, + { + "epoch": 0.6022134588474749, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.80003023147583, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8586223125457764, + "num_tokens": 180715538.0, + "step": 4734 + }, + { + "epoch": 0.6023406691260654, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8202698230743408, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8581441640853882, + "num_tokens": 180755298.0, + "step": 4735 + }, + { + "epoch": 0.6024678794046558, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7642455101013184, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8520923256874084, + "num_tokens": 180797104.0, + "step": 4736 + }, + { + "epoch": 0.6025950896832464, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7049661874771118, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8691694140434265, + "num_tokens": 180839845.0, + "step": 4737 + }, + { + "epoch": 0.6027222999618369, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.782577395439148, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.871872067451477, + "num_tokens": 180878849.0, + "step": 4738 + }, + { + "epoch": 0.6028495102404274, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7949440479278564, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8577312231063843, + "num_tokens": 180918427.0, + "step": 4739 + }, + { + "epoch": 0.602976720519018, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9361164569854736, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8513059616088867, + "num_tokens": 180956206.0, + "step": 4740 + }, + { + "epoch": 0.6031039307976085, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8290129899978638, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8648663759231567, + "num_tokens": 180996945.0, + "step": 4741 + }, + { + "epoch": 0.6032311410761989, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8155086040496826, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8633352518081665, + "num_tokens": 181031763.0, + "step": 4742 + }, + { + "epoch": 0.6033583513547894, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7714743614196777, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8658649325370789, + "num_tokens": 181070661.0, + "step": 4743 + }, + { + "epoch": 0.60348556163338, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.825485110282898, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8460857272148132, + "num_tokens": 181112769.0, + "step": 4744 + }, + { + "epoch": 0.6036127719119705, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8110805749893188, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8602520823478699, + "num_tokens": 181151155.0, + "step": 4745 + }, + { + "epoch": 0.603739982190561, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.810899257659912, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8587704300880432, + "num_tokens": 181188689.0, + "step": 4746 + }, + { + "epoch": 0.6038671924691515, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8086929321289062, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8631419539451599, + "num_tokens": 181230031.0, + "step": 4747 + }, + { + "epoch": 0.603994402747742, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8053659200668335, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8582768440246582, + "num_tokens": 181271464.0, + "step": 4748 + }, + { + "epoch": 0.6041216130263325, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8138465881347656, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8524544835090637, + "num_tokens": 181312607.0, + "step": 4749 + }, + { + "epoch": 0.604248823304923, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0085930824279785, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8675470352172852, + "num_tokens": 181348628.0, + "step": 4750 + }, + { + "epoch": 0.6043760335835135, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8936767578125, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8510873317718506, + "num_tokens": 181387975.0, + "step": 4751 + }, + { + "epoch": 0.6045032438621041, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8604106903076172, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8716570734977722, + "num_tokens": 181424163.0, + "step": 4752 + }, + { + "epoch": 0.6046304541406946, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9917042255401611, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8473538756370544, + "num_tokens": 181457002.0, + "step": 4753 + }, + { + "epoch": 0.604757664419285, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9346791505813599, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8545375466346741, + "num_tokens": 181494410.0, + "step": 4754 + }, + { + "epoch": 0.6048848746978756, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9540863037109375, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8558108806610107, + "num_tokens": 181534595.0, + "step": 4755 + }, + { + "epoch": 0.6050120849764661, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9298923015594482, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8655993938446045, + "num_tokens": 181570654.0, + "step": 4756 + }, + { + "epoch": 0.6051392952550566, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9073492288589478, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8525780439376831, + "num_tokens": 181608366.0, + "step": 4757 + }, + { + "epoch": 0.6052665055336471, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8651317358016968, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8470745086669922, + "num_tokens": 181647168.0, + "step": 4758 + }, + { + "epoch": 0.6053937158122377, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.899137258529663, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8611047267913818, + "num_tokens": 181689177.0, + "step": 4759 + }, + { + "epoch": 0.6055209260908282, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0008277893066406, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8626512289047241, + "num_tokens": 181723742.0, + "step": 4760 + }, + { + "epoch": 0.6056481363694186, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.857085108757019, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8679075241088867, + "num_tokens": 181761539.0, + "step": 4761 + }, + { + "epoch": 0.6057753466480091, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.2070953845977783, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8560347557067871, + "num_tokens": 181799674.0, + "step": 4762 + }, + { + "epoch": 0.6059025569265997, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.940085530281067, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8465379476547241, + "num_tokens": 181838784.0, + "step": 4763 + }, + { + "epoch": 0.6060297672051902, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8348780870437622, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.854026198387146, + "num_tokens": 181875669.0, + "step": 4764 + }, + { + "epoch": 0.6061569774837807, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.990241527557373, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8416028022766113, + "num_tokens": 181912450.0, + "step": 4765 + }, + { + "epoch": 0.6062841877623713, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.900341272354126, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.857823371887207, + "num_tokens": 181947691.0, + "step": 4766 + }, + { + "epoch": 0.6064113980409617, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9531821012496948, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8626314401626587, + "num_tokens": 181982364.0, + "step": 4767 + }, + { + "epoch": 0.6065386083195522, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.004809856414795, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.858229398727417, + "num_tokens": 182017071.0, + "step": 4768 + }, + { + "epoch": 0.6066658185981427, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8946510553359985, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.858492910861969, + "num_tokens": 182057234.0, + "step": 4769 + }, + { + "epoch": 0.6067930288767333, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0618350505828857, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8710136413574219, + "num_tokens": 182092450.0, + "step": 4770 + }, + { + "epoch": 0.6069202391553238, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0127124786376953, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8508350849151611, + "num_tokens": 182126398.0, + "step": 4771 + }, + { + "epoch": 0.6070474494339143, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.030618190765381, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8732336163520813, + "num_tokens": 182160848.0, + "step": 4772 + }, + { + "epoch": 0.6071746597125047, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7629790306091309, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8678185939788818, + "num_tokens": 182202596.0, + "step": 4773 + }, + { + "epoch": 0.6073018699910953, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8550056219100952, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8567491769790649, + "num_tokens": 182239235.0, + "step": 4774 + }, + { + "epoch": 0.6074290802696858, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8981902599334717, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8707663416862488, + "num_tokens": 182273662.0, + "step": 4775 + }, + { + "epoch": 0.6075562905482763, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9017231464385986, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8439251184463501, + "num_tokens": 182311991.0, + "step": 4776 + }, + { + "epoch": 0.6076835008268668, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.952518105506897, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8604109287261963, + "num_tokens": 182350722.0, + "step": 4777 + }, + { + "epoch": 0.6078107111054574, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9889769554138184, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8609246015548706, + "num_tokens": 182386755.0, + "step": 4778 + }, + { + "epoch": 0.6079379213840478, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8431352376937866, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8748301267623901, + "num_tokens": 182423128.0, + "step": 4779 + }, + { + "epoch": 0.6080651316626383, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.892594337463379, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8457189798355103, + "num_tokens": 182461518.0, + "step": 4780 + }, + { + "epoch": 0.6081923419412288, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9829392433166504, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8589755892753601, + "num_tokens": 182495707.0, + "step": 4781 + }, + { + "epoch": 0.6083195522198194, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.054619789123535, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8491817712783813, + "num_tokens": 182539058.0, + "step": 4782 + }, + { + "epoch": 0.6084467624984099, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.147533893585205, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8449599742889404, + "num_tokens": 182572482.0, + "step": 4783 + }, + { + "epoch": 0.6085739727770004, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.917934536933899, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8644242882728577, + "num_tokens": 182607665.0, + "step": 4784 + }, + { + "epoch": 0.6087011830555908, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.1342241764068604, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8596953749656677, + "num_tokens": 182650503.0, + "step": 4785 + }, + { + "epoch": 0.6088283933341814, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8895092010498047, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8701699376106262, + "num_tokens": 182681662.0, + "step": 4786 + }, + { + "epoch": 0.6089556036127719, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.170588254928589, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8476218581199646, + "num_tokens": 182714912.0, + "step": 4787 + }, + { + "epoch": 0.6090828138913624, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.123812437057495, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8553558588027954, + "num_tokens": 182748813.0, + "step": 4788 + }, + { + "epoch": 0.609210024169953, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.1179165840148926, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8431553840637207, + "num_tokens": 182786123.0, + "step": 4789 + }, + { + "epoch": 0.6093372344485435, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.0585944652557373, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8719313144683838, + "num_tokens": 182820550.0, + "step": 4790 + }, + { + "epoch": 0.6094644447271339, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9120240211486816, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8590391874313354, + "num_tokens": 182863135.0, + "step": 4791 + }, + { + "epoch": 0.6095916550057244, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9452271461486816, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8605236411094666, + "num_tokens": 182905193.0, + "step": 4792 + }, + { + "epoch": 0.609718865284315, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7812187671661377, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8588396310806274, + "num_tokens": 182944934.0, + "step": 4793 + }, + { + "epoch": 0.6098460755629055, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.859744906425476, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8464255332946777, + "num_tokens": 182984604.0, + "step": 4794 + }, + { + "epoch": 0.609973285841496, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.869132161140442, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8569929599761963, + "num_tokens": 183020421.0, + "step": 4795 + }, + { + "epoch": 0.6101004961200865, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9280372858047485, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8501360416412354, + "num_tokens": 183057635.0, + "step": 4796 + }, + { + "epoch": 0.610227706398677, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8903089761734009, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8465179204940796, + "num_tokens": 183094689.0, + "step": 4797 + }, + { + "epoch": 0.6103549166772675, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0349063873291016, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8634187579154968, + "num_tokens": 183129118.0, + "step": 4798 + }, + { + "epoch": 0.610482126955858, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9630025625228882, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8578721284866333, + "num_tokens": 183163365.0, + "step": 4799 + }, + { + "epoch": 0.6106093372344485, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.918436884880066, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8617703914642334, + "num_tokens": 183199271.0, + "step": 4800 + }, + { + "epoch": 0.6107365475130391, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8841735124588013, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8774163722991943, + "num_tokens": 183234523.0, + "step": 4801 + }, + { + "epoch": 0.6108637577916296, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9740575551986694, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.859198808670044, + "num_tokens": 183267098.0, + "step": 4802 + }, + { + "epoch": 0.61099096807022, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8800803422927856, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8594098687171936, + "num_tokens": 183306825.0, + "step": 4803 + }, + { + "epoch": 0.6111181783488105, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8660355806350708, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8589152693748474, + "num_tokens": 183348712.0, + "step": 4804 + }, + { + "epoch": 0.6112453886274011, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.861916184425354, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8700361847877502, + "num_tokens": 183383797.0, + "step": 4805 + }, + { + "epoch": 0.6113725989059916, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9145399332046509, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8703058958053589, + "num_tokens": 183417484.0, + "step": 4806 + }, + { + "epoch": 0.6114998091845821, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.880240559577942, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8653597235679626, + "num_tokens": 183453808.0, + "step": 4807 + }, + { + "epoch": 0.6116270194631727, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.782045602798462, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8661273717880249, + "num_tokens": 183496693.0, + "step": 4808 + }, + { + "epoch": 0.6117542297417632, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.866776943206787, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8614886999130249, + "num_tokens": 183536727.0, + "step": 4809 + }, + { + "epoch": 0.6118814400203536, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.879128098487854, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.859754204750061, + "num_tokens": 183571809.0, + "step": 4810 + }, + { + "epoch": 0.6120086502989441, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 3.9660327434539795, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.858202338218689, + "num_tokens": 183610761.0, + "step": 4811 + }, + { + "epoch": 0.6121358605775347, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.1068131923675537, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8558008670806885, + "num_tokens": 183648304.0, + "step": 4812 + }, + { + "epoch": 0.6122630708561252, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9948723316192627, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8539886474609375, + "num_tokens": 183684201.0, + "step": 4813 + }, + { + "epoch": 0.6123902811347157, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8123713731765747, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8519527912139893, + "num_tokens": 183726595.0, + "step": 4814 + }, + { + "epoch": 0.6125174914133062, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8717427253723145, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8687872886657715, + "num_tokens": 183762222.0, + "step": 4815 + }, + { + "epoch": 0.6126447016918967, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9640578031539917, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8710579872131348, + "num_tokens": 183797167.0, + "step": 4816 + }, + { + "epoch": 0.6127719119704872, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8126051425933838, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8570646643638611, + "num_tokens": 183839591.0, + "step": 4817 + }, + { + "epoch": 0.6128991222490777, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7721667289733887, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8596819639205933, + "num_tokens": 183877862.0, + "step": 4818 + }, + { + "epoch": 0.6130263325276682, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8536514043807983, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8638613224029541, + "num_tokens": 183915964.0, + "step": 4819 + }, + { + "epoch": 0.6131535428062588, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.050276279449463, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8557851314544678, + "num_tokens": 183952823.0, + "step": 4820 + }, + { + "epoch": 0.6132807530848493, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8670427799224854, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8505078554153442, + "num_tokens": 183991650.0, + "step": 4821 + }, + { + "epoch": 0.6134079633634397, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8441389799118042, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8600211143493652, + "num_tokens": 184027768.0, + "step": 4822 + }, + { + "epoch": 0.6135351736420303, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8807621002197266, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8710317015647888, + "num_tokens": 184063754.0, + "step": 4823 + }, + { + "epoch": 0.6136623839206208, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7673550844192505, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8532301187515259, + "num_tokens": 184105286.0, + "step": 4824 + }, + { + "epoch": 0.6137895941992113, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.837540626525879, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8644523620605469, + "num_tokens": 184139312.0, + "step": 4825 + }, + { + "epoch": 0.6139168044778018, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.783020257949829, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8579140901565552, + "num_tokens": 184181541.0, + "step": 4826 + }, + { + "epoch": 0.6140440147563924, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0157697200775146, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8595796227455139, + "num_tokens": 184216857.0, + "step": 4827 + }, + { + "epoch": 0.6141712250349828, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.2823474407196045, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8632490634918213, + "num_tokens": 184253318.0, + "step": 4828 + }, + { + "epoch": 0.6142984353135733, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8474886417388916, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8701916933059692, + "num_tokens": 184298625.0, + "step": 4829 + }, + { + "epoch": 0.6144256455921638, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0842719078063965, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8571729063987732, + "num_tokens": 184334569.0, + "step": 4830 + }, + { + "epoch": 0.6145528558707544, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.80088472366333, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8414649963378906, + "num_tokens": 184377477.0, + "step": 4831 + }, + { + "epoch": 0.6146800661493449, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9446582794189453, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8465811014175415, + "num_tokens": 184416671.0, + "step": 4832 + }, + { + "epoch": 0.6148072764279354, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9433354139328003, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8542546033859253, + "num_tokens": 184453174.0, + "step": 4833 + }, + { + "epoch": 0.6149344867065258, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9544649124145508, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.865817666053772, + "num_tokens": 184488313.0, + "step": 4834 + }, + { + "epoch": 0.6150616969851164, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.052077293395996, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8387973308563232, + "num_tokens": 184523733.0, + "step": 4835 + }, + { + "epoch": 0.6151889072637069, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7452195882797241, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8563312292098999, + "num_tokens": 184568399.0, + "step": 4836 + }, + { + "epoch": 0.6153161175422974, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0278801918029785, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8579854965209961, + "num_tokens": 184603302.0, + "step": 4837 + }, + { + "epoch": 0.615443327820888, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.868134617805481, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8793187141418457, + "num_tokens": 184637346.0, + "step": 4838 + }, + { + "epoch": 0.6155705380994785, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9650342464447021, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8666825294494629, + "num_tokens": 184671687.0, + "step": 4839 + }, + { + "epoch": 0.6156977483780689, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9920228719711304, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8687968850135803, + "num_tokens": 184706867.0, + "step": 4840 + }, + { + "epoch": 0.6158249586566594, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0281224250793457, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8550735712051392, + "num_tokens": 184743916.0, + "step": 4841 + }, + { + "epoch": 0.61595216893525, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9107484817504883, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8678907155990601, + "num_tokens": 184783247.0, + "step": 4842 + }, + { + "epoch": 0.6160793792138405, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7048845291137695, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8603721857070923, + "num_tokens": 184826501.0, + "step": 4843 + }, + { + "epoch": 0.616206589492431, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7727667093276978, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8581008315086365, + "num_tokens": 184868743.0, + "step": 4844 + }, + { + "epoch": 0.6163337997710215, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8368831872940063, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8543272018432617, + "num_tokens": 184908992.0, + "step": 4845 + }, + { + "epoch": 0.616461010049612, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9532513618469238, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8478083610534668, + "num_tokens": 184942456.0, + "step": 4846 + }, + { + "epoch": 0.6165882203282025, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.101372003555298, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8495275974273682, + "num_tokens": 184975834.0, + "step": 4847 + }, + { + "epoch": 0.616715430606793, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9289640188217163, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8817939758300781, + "num_tokens": 185012295.0, + "step": 4848 + }, + { + "epoch": 0.6168426408853835, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.082359790802002, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8631399869918823, + "num_tokens": 185049272.0, + "step": 4849 + }, + { + "epoch": 0.6169698511639741, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.005919933319092, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8705007433891296, + "num_tokens": 185081054.0, + "step": 4850 + }, + { + "epoch": 0.6170970614425646, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7987027168273926, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8545268774032593, + "num_tokens": 185119809.0, + "step": 4851 + }, + { + "epoch": 0.617224271721155, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.701965093612671, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8693076372146606, + "num_tokens": 185161462.0, + "step": 4852 + }, + { + "epoch": 0.6173514819997455, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 1.9897736310958862, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8501932621002197, + "num_tokens": 185196238.0, + "step": 4853 + }, + { + "epoch": 0.6174786922783361, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8677875995635986, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8582624793052673, + "num_tokens": 185239317.0, + "step": 4854 + }, + { + "epoch": 0.6176059025569266, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.851042628288269, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8683912754058838, + "num_tokens": 185273555.0, + "step": 4855 + }, + { + "epoch": 0.6177331128355171, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.986782431602478, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8660439252853394, + "num_tokens": 185309721.0, + "step": 4856 + }, + { + "epoch": 0.6178603231141077, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8412435054779053, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8522483110427856, + "num_tokens": 185350846.0, + "step": 4857 + }, + { + "epoch": 0.6179875333926982, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9226239919662476, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8524401187896729, + "num_tokens": 185392472.0, + "step": 4858 + }, + { + "epoch": 0.6181147436712886, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8075199127197266, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8553897738456726, + "num_tokens": 185438250.0, + "step": 4859 + }, + { + "epoch": 0.6182419539498791, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8995038270950317, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8562188148498535, + "num_tokens": 185479455.0, + "step": 4860 + }, + { + "epoch": 0.6183691642284697, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8763192892074585, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8553978204727173, + "num_tokens": 185518719.0, + "step": 4861 + }, + { + "epoch": 0.6184963745070602, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0690717697143555, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8442820310592651, + "num_tokens": 185556671.0, + "step": 4862 + }, + { + "epoch": 0.6186235847856507, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9174176454544067, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8559726476669312, + "num_tokens": 185595063.0, + "step": 4863 + }, + { + "epoch": 0.6187507950642412, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8616856336593628, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8495903015136719, + "num_tokens": 185635626.0, + "step": 4864 + }, + { + "epoch": 0.6188780053428317, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9778568744659424, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8485738635063171, + "num_tokens": 185671687.0, + "step": 4865 + }, + { + "epoch": 0.6190052156214222, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9743611812591553, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8631311655044556, + "num_tokens": 185706459.0, + "step": 4866 + }, + { + "epoch": 0.6191324259000127, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7791187763214111, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8768006563186646, + "num_tokens": 185743162.0, + "step": 4867 + }, + { + "epoch": 0.6192596361786032, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9072798490524292, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8445820808410645, + "num_tokens": 185784036.0, + "step": 4868 + }, + { + "epoch": 0.6193868464571938, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.035991668701172, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8553478717803955, + "num_tokens": 185821407.0, + "step": 4869 + }, + { + "epoch": 0.6195140567357843, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.894985318183899, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8651738166809082, + "num_tokens": 185857789.0, + "step": 4870 + }, + { + "epoch": 0.6196412670143747, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9203842878341675, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8707379102706909, + "num_tokens": 185896989.0, + "step": 4871 + }, + { + "epoch": 0.6197684772929652, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.803694248199463, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.873454749584198, + "num_tokens": 185938123.0, + "step": 4872 + }, + { + "epoch": 0.6198956875715558, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.947713851928711, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8589465618133545, + "num_tokens": 185973440.0, + "step": 4873 + }, + { + "epoch": 0.6200228978501463, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9344043731689453, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8604326844215393, + "num_tokens": 186010069.0, + "step": 4874 + }, + { + "epoch": 0.6201501081287368, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9392428398132324, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.853134036064148, + "num_tokens": 186048365.0, + "step": 4875 + }, + { + "epoch": 0.6202773184073274, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9783101081848145, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8644406795501709, + "num_tokens": 186085358.0, + "step": 4876 + }, + { + "epoch": 0.6204045286859178, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9911679029464722, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8630307912826538, + "num_tokens": 186118476.0, + "step": 4877 + }, + { + "epoch": 0.6205317389645083, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.012779474258423, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8553552627563477, + "num_tokens": 186155487.0, + "step": 4878 + }, + { + "epoch": 0.6206589492430988, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8321993350982666, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.865050196647644, + "num_tokens": 186192941.0, + "step": 4879 + }, + { + "epoch": 0.6207861595216894, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8805590867996216, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8622515201568604, + "num_tokens": 186233860.0, + "step": 4880 + }, + { + "epoch": 0.6209133698002799, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.174670457839966, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.848328173160553, + "num_tokens": 186273081.0, + "step": 4881 + }, + { + "epoch": 0.6210405800788704, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8598085641860962, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8639029264450073, + "num_tokens": 186310596.0, + "step": 4882 + }, + { + "epoch": 0.6211677903574608, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9303898811340332, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8533624410629272, + "num_tokens": 186346262.0, + "step": 4883 + }, + { + "epoch": 0.6212950006360514, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8799052238464355, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8614766597747803, + "num_tokens": 186386508.0, + "step": 4884 + }, + { + "epoch": 0.6214222109146419, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0014352798461914, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8551980257034302, + "num_tokens": 186425184.0, + "step": 4885 + }, + { + "epoch": 0.6215494211932324, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.903533935546875, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8581660985946655, + "num_tokens": 186463072.0, + "step": 4886 + }, + { + "epoch": 0.621676631471823, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8610848188400269, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8688555955886841, + "num_tokens": 186504062.0, + "step": 4887 + }, + { + "epoch": 0.6218038417504135, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8960613012313843, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.849181056022644, + "num_tokens": 186544564.0, + "step": 4888 + }, + { + "epoch": 0.6219310520290039, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8741586208343506, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8445208072662354, + "num_tokens": 186587196.0, + "step": 4889 + }, + { + "epoch": 0.6220582623075944, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0373635292053223, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8602452874183655, + "num_tokens": 186620673.0, + "step": 4890 + }, + { + "epoch": 0.622185472586185, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.836798071861267, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8619745969772339, + "num_tokens": 186654028.0, + "step": 4891 + }, + { + "epoch": 0.6223126828647755, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.831611156463623, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8563165664672852, + "num_tokens": 186693395.0, + "step": 4892 + }, + { + "epoch": 0.622439893143366, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8955475091934204, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8412594199180603, + "num_tokens": 186734440.0, + "step": 4893 + }, + { + "epoch": 0.6225671034219565, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0564332008361816, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.862259030342102, + "num_tokens": 186769637.0, + "step": 4894 + }, + { + "epoch": 0.622694313700547, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.793357253074646, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8549075126647949, + "num_tokens": 186810188.0, + "step": 4895 + }, + { + "epoch": 0.6228215239791375, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8953590393066406, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8490576148033142, + "num_tokens": 186847512.0, + "step": 4896 + }, + { + "epoch": 0.622948734257728, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.1104371547698975, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8678102493286133, + "num_tokens": 186878721.0, + "step": 4897 + }, + { + "epoch": 0.6230759445363185, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8600345849990845, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8632882833480835, + "num_tokens": 186920239.0, + "step": 4898 + }, + { + "epoch": 0.6232031548149091, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8242592811584473, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8532361388206482, + "num_tokens": 186964158.0, + "step": 4899 + }, + { + "epoch": 0.6233303650934996, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.890410304069519, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8740212917327881, + "num_tokens": 187003176.0, + "step": 4900 + }, + { + "epoch": 0.62345757537209, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.7617932558059692, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.85798579454422, + "num_tokens": 187046152.0, + "step": 4901 + }, + { + "epoch": 0.6235847856506805, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8830302953720093, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8713457584381104, + "num_tokens": 187086629.0, + "step": 4902 + }, + { + "epoch": 0.6237119959292711, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9083908796310425, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8525923490524292, + "num_tokens": 187122856.0, + "step": 4903 + }, + { + "epoch": 0.6238392062078616, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.92693030834198, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8730885982513428, + "num_tokens": 187161568.0, + "step": 4904 + }, + { + "epoch": 0.6239664164864521, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8433992862701416, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8595041036605835, + "num_tokens": 187202831.0, + "step": 4905 + }, + { + "epoch": 0.6240936267650427, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.000683069229126, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8656631112098694, + "num_tokens": 187239059.0, + "step": 4906 + }, + { + "epoch": 0.6242208370436332, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.919337511062622, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8543440103530884, + "num_tokens": 187272940.0, + "step": 4907 + }, + { + "epoch": 0.6243480473222236, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.047715425491333, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8723679780960083, + "num_tokens": 187310963.0, + "step": 4908 + }, + { + "epoch": 0.6244752576008141, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8148565292358398, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8783425688743591, + "num_tokens": 187345698.0, + "step": 4909 + }, + { + "epoch": 0.6246024678794047, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8699557781219482, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8559498190879822, + "num_tokens": 187384762.0, + "step": 4910 + }, + { + "epoch": 0.6247296781579952, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8747916221618652, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8489371538162231, + "num_tokens": 187424288.0, + "step": 4911 + }, + { + "epoch": 0.6248568884365857, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7697139978408813, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8680719137191772, + "num_tokens": 187461481.0, + "step": 4912 + }, + { + "epoch": 0.6249840987151762, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8274379968643188, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8731207251548767, + "num_tokens": 187497690.0, + "step": 4913 + }, + { + "epoch": 0.6251113089937667, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.1226136684417725, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8534271717071533, + "num_tokens": 187537825.0, + "step": 4914 + }, + { + "epoch": 0.6252385192723572, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.7148233652114868, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8691299557685852, + "num_tokens": 187583998.0, + "step": 4915 + }, + { + "epoch": 0.6253657295509477, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9271520376205444, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.85774827003479, + "num_tokens": 187621962.0, + "step": 4916 + }, + { + "epoch": 0.6254929398295382, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9561758041381836, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8662952184677124, + "num_tokens": 187655913.0, + "step": 4917 + }, + { + "epoch": 0.6256201501081288, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.004685163497925, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8502506017684937, + "num_tokens": 187692433.0, + "step": 4918 + }, + { + "epoch": 0.6257473603867193, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.162602186203003, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.870899498462677, + "num_tokens": 187724311.0, + "step": 4919 + }, + { + "epoch": 0.6258745706653097, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8426337242126465, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8670640587806702, + "num_tokens": 187766614.0, + "step": 4920 + }, + { + "epoch": 0.6260017809439002, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.845503330230713, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8708226680755615, + "num_tokens": 187804847.0, + "step": 4921 + }, + { + "epoch": 0.6261289912224908, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8394591808319092, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8608611822128296, + "num_tokens": 187842791.0, + "step": 4922 + }, + { + "epoch": 0.6262562015010813, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.034990072250366, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.850758969783783, + "num_tokens": 187878304.0, + "step": 4923 + }, + { + "epoch": 0.6263834117796718, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.943834900856018, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8609021902084351, + "num_tokens": 187915022.0, + "step": 4924 + }, + { + "epoch": 0.6265106220582624, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9532893896102905, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.858561635017395, + "num_tokens": 187950533.0, + "step": 4925 + }, + { + "epoch": 0.6266378323368528, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9228535890579224, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8556475043296814, + "num_tokens": 187989893.0, + "step": 4926 + }, + { + "epoch": 0.6267650426154433, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9699689149856567, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8532204627990723, + "num_tokens": 188025031.0, + "step": 4927 + }, + { + "epoch": 0.6268922528940338, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9340871572494507, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8512619733810425, + "num_tokens": 188060797.0, + "step": 4928 + }, + { + "epoch": 0.6270194631726244, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.053506851196289, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8521552085876465, + "num_tokens": 188096271.0, + "step": 4929 + }, + { + "epoch": 0.6271466734512149, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8679991960525513, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8593938946723938, + "num_tokens": 188136697.0, + "step": 4930 + }, + { + "epoch": 0.6272738837298054, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.9205495119094849, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8578150272369385, + "num_tokens": 188174715.0, + "step": 4931 + }, + { + "epoch": 0.6274010940083958, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8657537698745728, + "learning_rate": 1e-06, + "loss": 0.5307, + "mean_token_accuracy": 0.8369717597961426, + "num_tokens": 188216379.0, + "step": 4932 + }, + { + "epoch": 0.6275283042869864, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8847851753234863, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8647117614746094, + "num_tokens": 188252183.0, + "step": 4933 + }, + { + "epoch": 0.6276555145655769, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9207329750061035, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8616374731063843, + "num_tokens": 188291128.0, + "step": 4934 + }, + { + "epoch": 0.6277827248441674, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 5.086111545562744, + "learning_rate": 1e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8420940637588501, + "num_tokens": 188334256.0, + "step": 4935 + }, + { + "epoch": 0.627909935122758, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.895821452140808, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8522064685821533, + "num_tokens": 188373207.0, + "step": 4936 + }, + { + "epoch": 0.6280371454013485, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8572889566421509, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8564246892929077, + "num_tokens": 188414937.0, + "step": 4937 + }, + { + "epoch": 0.6281643556799389, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8054713010787964, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8557097315788269, + "num_tokens": 188455398.0, + "step": 4938 + }, + { + "epoch": 0.6282915659585294, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.999027967453003, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.859145998954773, + "num_tokens": 188496561.0, + "step": 4939 + }, + { + "epoch": 0.62841877623712, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9500092267990112, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8634411096572876, + "num_tokens": 188531318.0, + "step": 4940 + }, + { + "epoch": 0.6285459865157105, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8809539079666138, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.849465548992157, + "num_tokens": 188566807.0, + "step": 4941 + }, + { + "epoch": 0.628673196794301, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.906571626663208, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8538621068000793, + "num_tokens": 188608062.0, + "step": 4942 + }, + { + "epoch": 0.6288004070728915, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9875729084014893, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8711943626403809, + "num_tokens": 188644953.0, + "step": 4943 + }, + { + "epoch": 0.628927617351482, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7410963773727417, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.874483585357666, + "num_tokens": 188687720.0, + "step": 4944 + }, + { + "epoch": 0.6290548276300725, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8042771816253662, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8525707125663757, + "num_tokens": 188730077.0, + "step": 4945 + }, + { + "epoch": 0.629182037908663, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7811436653137207, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.864619255065918, + "num_tokens": 188768306.0, + "step": 4946 + }, + { + "epoch": 0.6293092481872535, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.017961025238037, + "learning_rate": 1e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.837624192237854, + "num_tokens": 188807480.0, + "step": 4947 + }, + { + "epoch": 0.6294364584658441, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.2434911727905273, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8495516180992126, + "num_tokens": 188836383.0, + "step": 4948 + }, + { + "epoch": 0.6295636687444346, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8187426328659058, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8624907732009888, + "num_tokens": 188876079.0, + "step": 4949 + }, + { + "epoch": 0.629690879023025, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9637746810913086, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8569012880325317, + "num_tokens": 188912936.0, + "step": 4950 + }, + { + "epoch": 0.6298180893016155, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9213128089904785, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8572930097579956, + "num_tokens": 188951475.0, + "step": 4951 + }, + { + "epoch": 0.6299452995802061, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8964592218399048, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8449411392211914, + "num_tokens": 188996972.0, + "step": 4952 + }, + { + "epoch": 0.6300725098587966, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8345451354980469, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8774200677871704, + "num_tokens": 189030706.0, + "step": 4953 + }, + { + "epoch": 0.6301997201373871, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8648602962493896, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8813419342041016, + "num_tokens": 189065638.0, + "step": 4954 + }, + { + "epoch": 0.6303269304159776, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.99619460105896, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8722442984580994, + "num_tokens": 189101747.0, + "step": 4955 + }, + { + "epoch": 0.6304541406945681, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8861846923828125, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8555639982223511, + "num_tokens": 189142965.0, + "step": 4956 + }, + { + "epoch": 0.6305813509731586, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.868620753288269, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8640152215957642, + "num_tokens": 189177822.0, + "step": 4957 + }, + { + "epoch": 0.6307085612517491, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9054633378982544, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8527487516403198, + "num_tokens": 189215710.0, + "step": 4958 + }, + { + "epoch": 0.6308357715303397, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.111886739730835, + "learning_rate": 1e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.8309146165847778, + "num_tokens": 189250788.0, + "step": 4959 + }, + { + "epoch": 0.6309629818089302, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.2373876571655273, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.856067419052124, + "num_tokens": 189278451.0, + "step": 4960 + }, + { + "epoch": 0.6310901920875207, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8973745107650757, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.862486720085144, + "num_tokens": 189316857.0, + "step": 4961 + }, + { + "epoch": 0.6312174023661112, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.784925937652588, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8703910112380981, + "num_tokens": 189356600.0, + "step": 4962 + }, + { + "epoch": 0.6313446126447017, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9213714599609375, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.853615939617157, + "num_tokens": 189393134.0, + "step": 4963 + }, + { + "epoch": 0.6314718229232922, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9193159341812134, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8432140946388245, + "num_tokens": 189433431.0, + "step": 4964 + }, + { + "epoch": 0.6315990332018827, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9735968112945557, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8439798951148987, + "num_tokens": 189473738.0, + "step": 4965 + }, + { + "epoch": 0.6317262434804732, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.021630048751831, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8730711936950684, + "num_tokens": 189509930.0, + "step": 4966 + }, + { + "epoch": 0.6318534537590638, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8875391483306885, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.862544596195221, + "num_tokens": 189547801.0, + "step": 4967 + }, + { + "epoch": 0.6319806640376543, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.930013656616211, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8720945715904236, + "num_tokens": 189582226.0, + "step": 4968 + }, + { + "epoch": 0.6321078743162447, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8514522314071655, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8575374484062195, + "num_tokens": 189622915.0, + "step": 4969 + }, + { + "epoch": 0.6322350845948352, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.2245543003082275, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.866768479347229, + "num_tokens": 189658182.0, + "step": 4970 + }, + { + "epoch": 0.6323622948734258, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.858703851699829, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8608770370483398, + "num_tokens": 189696627.0, + "step": 4971 + }, + { + "epoch": 0.6324895051520163, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.060237407684326, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8563956618309021, + "num_tokens": 189729826.0, + "step": 4972 + }, + { + "epoch": 0.6326167154306068, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9042558670043945, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8634904026985168, + "num_tokens": 189771504.0, + "step": 4973 + }, + { + "epoch": 0.6327439257091974, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8312525749206543, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8765461444854736, + "num_tokens": 189809220.0, + "step": 4974 + }, + { + "epoch": 0.6328711359877878, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9949127435684204, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8678404688835144, + "num_tokens": 189849586.0, + "step": 4975 + }, + { + "epoch": 0.6329983462663783, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.072283983230591, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8592038750648499, + "num_tokens": 189885054.0, + "step": 4976 + }, + { + "epoch": 0.6331255565449688, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9185867309570312, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8516420722007751, + "num_tokens": 189918286.0, + "step": 4977 + }, + { + "epoch": 0.6332527668235594, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9290037155151367, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.840579628944397, + "num_tokens": 189956845.0, + "step": 4978 + }, + { + "epoch": 0.6333799771021499, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.809460997581482, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8766518831253052, + "num_tokens": 189994827.0, + "step": 4979 + }, + { + "epoch": 0.6335071873807404, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8112335205078125, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8544859886169434, + "num_tokens": 190036066.0, + "step": 4980 + }, + { + "epoch": 0.6336343976593308, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8375582695007324, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8712554574012756, + "num_tokens": 190073326.0, + "step": 4981 + }, + { + "epoch": 0.6337616079379214, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7993310689926147, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.857683539390564, + "num_tokens": 190109125.0, + "step": 4982 + }, + { + "epoch": 0.6338888182165119, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.863850474357605, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8654439449310303, + "num_tokens": 190148520.0, + "step": 4983 + }, + { + "epoch": 0.6340160284951024, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8355815410614014, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8627760410308838, + "num_tokens": 190184272.0, + "step": 4984 + }, + { + "epoch": 0.6341432387736929, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.2624175548553467, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8579856157302856, + "num_tokens": 190221483.0, + "step": 4985 + }, + { + "epoch": 0.6342704490522835, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7246112823486328, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8708188533782959, + "num_tokens": 190266430.0, + "step": 4986 + }, + { + "epoch": 0.6343976593308739, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.6856154203414917, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8718051910400391, + "num_tokens": 190308575.0, + "step": 4987 + }, + { + "epoch": 0.6345248696094644, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8436309099197388, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8692417144775391, + "num_tokens": 190344584.0, + "step": 4988 + }, + { + "epoch": 0.6346520798880549, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.980959415435791, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8683686256408691, + "num_tokens": 190376497.0, + "step": 4989 + }, + { + "epoch": 0.6347792901666455, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9552220106124878, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8488855361938477, + "num_tokens": 190416879.0, + "step": 4990 + }, + { + "epoch": 0.634906500445236, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7779830694198608, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8808860778808594, + "num_tokens": 190454684.0, + "step": 4991 + }, + { + "epoch": 0.6350337107238265, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.044100284576416, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8514260053634644, + "num_tokens": 190490524.0, + "step": 4992 + }, + { + "epoch": 0.635160921002417, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9264637231826782, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8669238090515137, + "num_tokens": 190526558.0, + "step": 4993 + }, + { + "epoch": 0.6352881312810075, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7539646625518799, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8595951199531555, + "num_tokens": 190571206.0, + "step": 4994 + }, + { + "epoch": 0.635415341559598, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8234398365020752, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8482284545898438, + "num_tokens": 190611480.0, + "step": 4995 + }, + { + "epoch": 0.6355425518381885, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0129435062408447, + "learning_rate": 1e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8323318362236023, + "num_tokens": 190647699.0, + "step": 4996 + }, + { + "epoch": 0.6356697621167791, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.032428503036499, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.860558271408081, + "num_tokens": 190684863.0, + "step": 4997 + }, + { + "epoch": 0.6357969723953696, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0248031616210938, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8446667194366455, + "num_tokens": 190725006.0, + "step": 4998 + }, + { + "epoch": 0.63592418267396, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9731322526931763, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8563054800033569, + "num_tokens": 190757431.0, + "step": 4999 + }, + { + "epoch": 0.6360513929525505, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8553829193115234, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.864540696144104, + "num_tokens": 190793118.0, + "step": 5000 + }, + { + "epoch": 0.6361786032311411, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.766064167022705, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8691811561584473, + "num_tokens": 190837105.0, + "step": 5001 + }, + { + "epoch": 0.6363058135097316, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9996185302734375, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8537464141845703, + "num_tokens": 190875372.0, + "step": 5002 + }, + { + "epoch": 0.6364330237883221, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8998650312423706, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8645892143249512, + "num_tokens": 190921307.0, + "step": 5003 + }, + { + "epoch": 0.6365602340669126, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8987969160079956, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8598533868789673, + "num_tokens": 190967919.0, + "step": 5004 + }, + { + "epoch": 0.6366874443455031, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.808916687965393, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8590004444122314, + "num_tokens": 191005668.0, + "step": 5005 + }, + { + "epoch": 0.6368146546240936, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9547663927078247, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8498035669326782, + "num_tokens": 191044455.0, + "step": 5006 + }, + { + "epoch": 0.6369418649026841, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7732410430908203, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8649787902832031, + "num_tokens": 191086369.0, + "step": 5007 + }, + { + "epoch": 0.6370690751812746, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8831353187561035, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8554633259773254, + "num_tokens": 191127070.0, + "step": 5008 + }, + { + "epoch": 0.6371962854598652, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9088534116744995, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8736593723297119, + "num_tokens": 191163623.0, + "step": 5009 + }, + { + "epoch": 0.6373234957384557, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9233615398406982, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8560179471969604, + "num_tokens": 191196790.0, + "step": 5010 + }, + { + "epoch": 0.6374507060170462, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8719221353530884, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8566292524337769, + "num_tokens": 191230265.0, + "step": 5011 + }, + { + "epoch": 0.6375779162956366, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9948947429656982, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8522635698318481, + "num_tokens": 191266781.0, + "step": 5012 + }, + { + "epoch": 0.6377051265742272, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8571237325668335, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.848914384841919, + "num_tokens": 191305869.0, + "step": 5013 + }, + { + "epoch": 0.6378323368528177, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.040635347366333, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8488080501556396, + "num_tokens": 191343498.0, + "step": 5014 + }, + { + "epoch": 0.6379595471314082, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9493693113327026, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8441594839096069, + "num_tokens": 191383260.0, + "step": 5015 + }, + { + "epoch": 0.6380867574099988, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9182566404342651, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8504945635795593, + "num_tokens": 191417350.0, + "step": 5016 + }, + { + "epoch": 0.6382139676885893, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7934693098068237, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8610038161277771, + "num_tokens": 191454336.0, + "step": 5017 + }, + { + "epoch": 0.6383411779671797, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.7906571626663208, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8607879877090454, + "num_tokens": 191489977.0, + "step": 5018 + }, + { + "epoch": 0.6384683882457702, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.0226974487304688, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8495126366615295, + "num_tokens": 191524622.0, + "step": 5019 + }, + { + "epoch": 0.6385955985243608, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9526829719543457, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8491129279136658, + "num_tokens": 191565389.0, + "step": 5020 + }, + { + "epoch": 0.6387228088029513, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.768524408340454, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8786537051200867, + "num_tokens": 191604640.0, + "step": 5021 + }, + { + "epoch": 0.6388500190815418, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8251352310180664, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8578327298164368, + "num_tokens": 191651029.0, + "step": 5022 + }, + { + "epoch": 0.6389772293601323, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9375393390655518, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8615462779998779, + "num_tokens": 191690524.0, + "step": 5023 + }, + { + "epoch": 0.6391044396387228, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8251041173934937, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8550525903701782, + "num_tokens": 191733992.0, + "step": 5024 + }, + { + "epoch": 0.6392316499173133, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9498481750488281, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8697018623352051, + "num_tokens": 191769717.0, + "step": 5025 + }, + { + "epoch": 0.6393588601959038, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.904380440711975, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8448845744132996, + "num_tokens": 191813334.0, + "step": 5026 + }, + { + "epoch": 0.6394860704744944, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9801784753799438, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8628232479095459, + "num_tokens": 191848295.0, + "step": 5027 + }, + { + "epoch": 0.6396132807530849, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8520840406417847, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8531771898269653, + "num_tokens": 191885329.0, + "step": 5028 + }, + { + "epoch": 0.6397404910316754, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0678317546844482, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8722196817398071, + "num_tokens": 191918070.0, + "step": 5029 + }, + { + "epoch": 0.6398677013102658, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.713089942932129, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.863897979259491, + "num_tokens": 191966084.0, + "step": 5030 + }, + { + "epoch": 0.6399949115888564, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9068636894226074, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8597618341445923, + "num_tokens": 192004907.0, + "step": 5031 + }, + { + "epoch": 0.6401221218674469, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9774792194366455, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8410508632659912, + "num_tokens": 192042942.0, + "step": 5032 + }, + { + "epoch": 0.6402493321460374, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.762834072113037, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8610758185386658, + "num_tokens": 192081133.0, + "step": 5033 + }, + { + "epoch": 0.6403765424246279, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8704946041107178, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.860439658164978, + "num_tokens": 192120171.0, + "step": 5034 + }, + { + "epoch": 0.6405037527032185, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9476985931396484, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8594724535942078, + "num_tokens": 192159583.0, + "step": 5035 + }, + { + "epoch": 0.6406309629818089, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0230982303619385, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8572925329208374, + "num_tokens": 192199513.0, + "step": 5036 + }, + { + "epoch": 0.6407581732603994, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0213747024536133, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8617415428161621, + "num_tokens": 192228352.0, + "step": 5037 + }, + { + "epoch": 0.6408853835389899, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9075217247009277, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8544449806213379, + "num_tokens": 192264480.0, + "step": 5038 + }, + { + "epoch": 0.6410125938175805, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8661552667617798, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8583423495292664, + "num_tokens": 192303307.0, + "step": 5039 + }, + { + "epoch": 0.641139804096171, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.84763503074646, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8612127900123596, + "num_tokens": 192342064.0, + "step": 5040 + }, + { + "epoch": 0.6412670143747615, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8826420307159424, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8659676313400269, + "num_tokens": 192379297.0, + "step": 5041 + }, + { + "epoch": 0.6413942246533519, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.893278956413269, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8638671636581421, + "num_tokens": 192416087.0, + "step": 5042 + }, + { + "epoch": 0.6415214349319425, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.7916687726974487, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8648765087127686, + "num_tokens": 192454421.0, + "step": 5043 + }, + { + "epoch": 0.641648645210533, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8428170680999756, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8556926250457764, + "num_tokens": 192493860.0, + "step": 5044 + }, + { + "epoch": 0.6417758554891235, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8068026304244995, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8516319394111633, + "num_tokens": 192533672.0, + "step": 5045 + }, + { + "epoch": 0.641903065767714, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9610008001327515, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8465073108673096, + "num_tokens": 192569569.0, + "step": 5046 + }, + { + "epoch": 0.6420302760463046, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7295221090316772, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8659462332725525, + "num_tokens": 192610716.0, + "step": 5047 + }, + { + "epoch": 0.642157486324895, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8386592864990234, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8522706031799316, + "num_tokens": 192648040.0, + "step": 5048 + }, + { + "epoch": 0.6422846966034855, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9379726648330688, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8586139678955078, + "num_tokens": 192690063.0, + "step": 5049 + }, + { + "epoch": 0.6424119068820761, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.830960988998413, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8551217317581177, + "num_tokens": 192732067.0, + "step": 5050 + }, + { + "epoch": 0.6425391171606666, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8500372171401978, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8693522214889526, + "num_tokens": 192768598.0, + "step": 5051 + }, + { + "epoch": 0.6426663274392571, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9988319873809814, + "learning_rate": 1e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.831255316734314, + "num_tokens": 192805420.0, + "step": 5052 + }, + { + "epoch": 0.6427935377178476, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.1295387744903564, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8546053767204285, + "num_tokens": 192852082.0, + "step": 5053 + }, + { + "epoch": 0.6429207479964381, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8901119232177734, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8603659868240356, + "num_tokens": 192892846.0, + "step": 5054 + }, + { + "epoch": 0.6430479582750286, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 3.0960257053375244, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8765358924865723, + "num_tokens": 192932091.0, + "step": 5055 + }, + { + "epoch": 0.6431751685536191, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0587213039398193, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.860818088054657, + "num_tokens": 192960628.0, + "step": 5056 + }, + { + "epoch": 0.6433023788322096, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8334801197052002, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8531961441040039, + "num_tokens": 193002075.0, + "step": 5057 + }, + { + "epoch": 0.6434295891108002, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8541380167007446, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8492452502250671, + "num_tokens": 193039720.0, + "step": 5058 + }, + { + "epoch": 0.6435567993893907, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8559576272964478, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8603734970092773, + "num_tokens": 193072806.0, + "step": 5059 + }, + { + "epoch": 0.6436840096679812, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8621349334716797, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8681447505950928, + "num_tokens": 193108478.0, + "step": 5060 + }, + { + "epoch": 0.6438112199465716, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7367221117019653, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8633551597595215, + "num_tokens": 193150663.0, + "step": 5061 + }, + { + "epoch": 0.6439384302251622, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9039626121520996, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8627021908760071, + "num_tokens": 193191499.0, + "step": 5062 + }, + { + "epoch": 0.6440656405037527, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9101537466049194, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8506189584732056, + "num_tokens": 193236103.0, + "step": 5063 + }, + { + "epoch": 0.6441928507823432, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.041126012802124, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8370795249938965, + "num_tokens": 193274095.0, + "step": 5064 + }, + { + "epoch": 0.6443200610609338, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.6942914724349976, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8653788566589355, + "num_tokens": 193316988.0, + "step": 5065 + }, + { + "epoch": 0.6444472713395243, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 3.022451400756836, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8583873510360718, + "num_tokens": 193357430.0, + "step": 5066 + }, + { + "epoch": 0.6445744816181147, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0532279014587402, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8671557307243347, + "num_tokens": 193390392.0, + "step": 5067 + }, + { + "epoch": 0.6447016918967052, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.0163636207580566, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8564239740371704, + "num_tokens": 193430961.0, + "step": 5068 + }, + { + "epoch": 0.6448289021752958, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8932745456695557, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8571934103965759, + "num_tokens": 193473767.0, + "step": 5069 + }, + { + "epoch": 0.6449561124538863, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.2662336826324463, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8477470278739929, + "num_tokens": 193511375.0, + "step": 5070 + }, + { + "epoch": 0.6450833227324768, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7722164392471313, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8692971467971802, + "num_tokens": 193551296.0, + "step": 5071 + }, + { + "epoch": 0.6452105330110673, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9413601160049438, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8630353212356567, + "num_tokens": 193596484.0, + "step": 5072 + }, + { + "epoch": 0.6453377432896578, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.010258197784424, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8657434582710266, + "num_tokens": 193637513.0, + "step": 5073 + }, + { + "epoch": 0.6454649535682483, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.0776822566986084, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8530145883560181, + "num_tokens": 193672808.0, + "step": 5074 + }, + { + "epoch": 0.6455921638468388, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.7926338911056519, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8743863105773926, + "num_tokens": 193713018.0, + "step": 5075 + }, + { + "epoch": 0.6457193741254293, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.7606223821640015, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8615201711654663, + "num_tokens": 193757649.0, + "step": 5076 + }, + { + "epoch": 0.6458465844040199, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.046597719192505, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8565561771392822, + "num_tokens": 193792390.0, + "step": 5077 + }, + { + "epoch": 0.6459737946826104, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.351555824279785, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8666257858276367, + "num_tokens": 193827721.0, + "step": 5078 + }, + { + "epoch": 0.6461010049612008, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.965038776397705, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8493192195892334, + "num_tokens": 193867217.0, + "step": 5079 + }, + { + "epoch": 0.6462282152397913, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.0802974700927734, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8663071990013123, + "num_tokens": 193900128.0, + "step": 5080 + }, + { + "epoch": 0.6463554255183819, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.094632625579834, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8697509765625, + "num_tokens": 193935744.0, + "step": 5081 + }, + { + "epoch": 0.6464826357969724, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8652935028076172, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8568959832191467, + "num_tokens": 193977344.0, + "step": 5082 + }, + { + "epoch": 0.6466098460755629, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8817294836044312, + "learning_rate": 1e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.8331959247589111, + "num_tokens": 194014295.0, + "step": 5083 + }, + { + "epoch": 0.6467370563541535, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.809206485748291, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8443882465362549, + "num_tokens": 194055245.0, + "step": 5084 + }, + { + "epoch": 0.6468642666327439, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9171713590621948, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8531150817871094, + "num_tokens": 194089313.0, + "step": 5085 + }, + { + "epoch": 0.6469914769113344, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8243175745010376, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8661916255950928, + "num_tokens": 194128293.0, + "step": 5086 + }, + { + "epoch": 0.6471186871899249, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8700594902038574, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8531951308250427, + "num_tokens": 194166632.0, + "step": 5087 + }, + { + "epoch": 0.6472458974685155, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.836253046989441, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8459475040435791, + "num_tokens": 194209176.0, + "step": 5088 + }, + { + "epoch": 0.647373107747106, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 80.5212631225586, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8482469320297241, + "num_tokens": 194245617.0, + "step": 5089 + }, + { + "epoch": 0.6475003180256965, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.087181329727173, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8653547167778015, + "num_tokens": 194285983.0, + "step": 5090 + }, + { + "epoch": 0.6476275283042869, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.15366530418396, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8557115793228149, + "num_tokens": 194317088.0, + "step": 5091 + }, + { + "epoch": 0.6477547385828775, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8524408340454102, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.864082932472229, + "num_tokens": 194354025.0, + "step": 5092 + }, + { + "epoch": 0.647881948861468, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.9883211851119995, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8629443645477295, + "num_tokens": 194388539.0, + "step": 5093 + }, + { + "epoch": 0.6480091591400585, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9203968048095703, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8605847954750061, + "num_tokens": 194424518.0, + "step": 5094 + }, + { + "epoch": 0.648136369418649, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8379795551300049, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8565243482589722, + "num_tokens": 194463310.0, + "step": 5095 + }, + { + "epoch": 0.6482635796972396, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.442858934402466, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8579965829849243, + "num_tokens": 194499911.0, + "step": 5096 + }, + { + "epoch": 0.64839078997583, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9707863330841064, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8499842882156372, + "num_tokens": 194536731.0, + "step": 5097 + }, + { + "epoch": 0.6485180002544205, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9661983251571655, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8483626246452332, + "num_tokens": 194574014.0, + "step": 5098 + }, + { + "epoch": 0.648645210533011, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9210755825042725, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8507957458496094, + "num_tokens": 194611944.0, + "step": 5099 + }, + { + "epoch": 0.6487724208116016, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8584917783737183, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8680358529090881, + "num_tokens": 194649734.0, + "step": 5100 + }, + { + "epoch": 0.6488996310901921, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.86332106590271, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8672765493392944, + "num_tokens": 194688344.0, + "step": 5101 + }, + { + "epoch": 0.6490268413687826, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9031379222869873, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8457517623901367, + "num_tokens": 194728582.0, + "step": 5102 + }, + { + "epoch": 0.649154051647373, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.884833574295044, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8590003252029419, + "num_tokens": 194762753.0, + "step": 5103 + }, + { + "epoch": 0.6492812619259636, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.79353928565979, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8719249963760376, + "num_tokens": 194802325.0, + "step": 5104 + }, + { + "epoch": 0.6494084722045541, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.739145040512085, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8576418161392212, + "num_tokens": 194846349.0, + "step": 5105 + }, + { + "epoch": 0.6495356824831446, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8185259103775024, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8604706525802612, + "num_tokens": 194880334.0, + "step": 5106 + }, + { + "epoch": 0.6496628927617352, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9521578550338745, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.849463701248169, + "num_tokens": 194919343.0, + "step": 5107 + }, + { + "epoch": 0.6497901030403257, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.876166820526123, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8597249984741211, + "num_tokens": 194958800.0, + "step": 5108 + }, + { + "epoch": 0.6499173133189162, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.0614922046661377, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.832712709903717, + "num_tokens": 194997249.0, + "step": 5109 + }, + { + "epoch": 0.6500445235975066, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.83935546875, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.860401451587677, + "num_tokens": 195035879.0, + "step": 5110 + }, + { + "epoch": 0.6501717338760972, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8400537967681885, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8556151390075684, + "num_tokens": 195075925.0, + "step": 5111 + }, + { + "epoch": 0.6502989441546877, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.0538363456726074, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8614368438720703, + "num_tokens": 195110185.0, + "step": 5112 + }, + { + "epoch": 0.6504261544332782, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.9060227870941162, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8505762815475464, + "num_tokens": 195154666.0, + "step": 5113 + }, + { + "epoch": 0.6505533647118688, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.934954047203064, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8695809841156006, + "num_tokens": 195188998.0, + "step": 5114 + }, + { + "epoch": 0.6506805749904593, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8625882863998413, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8654999732971191, + "num_tokens": 195224124.0, + "step": 5115 + }, + { + "epoch": 0.6508077852690497, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.2013111114501953, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8596281409263611, + "num_tokens": 195253601.0, + "step": 5116 + }, + { + "epoch": 0.6509349955476402, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9948384761810303, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8508853316307068, + "num_tokens": 195289045.0, + "step": 5117 + }, + { + "epoch": 0.6510622058262308, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.839508056640625, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.860881507396698, + "num_tokens": 195326983.0, + "step": 5118 + }, + { + "epoch": 0.6511894161048213, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.0518100261688232, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8710861802101135, + "num_tokens": 195360425.0, + "step": 5119 + }, + { + "epoch": 0.6513166263834118, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8176624774932861, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8470978736877441, + "num_tokens": 195402648.0, + "step": 5120 + }, + { + "epoch": 0.6514438366620023, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8997050523757935, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8769856691360474, + "num_tokens": 195440945.0, + "step": 5121 + }, + { + "epoch": 0.6515710469405928, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.984344720840454, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8621504902839661, + "num_tokens": 195481872.0, + "step": 5122 + }, + { + "epoch": 0.6516982572191833, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.712904930114746, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8636263608932495, + "num_tokens": 195525460.0, + "step": 5123 + }, + { + "epoch": 0.6518254674977738, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.844489336013794, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8616029620170593, + "num_tokens": 195563522.0, + "step": 5124 + }, + { + "epoch": 0.6519526777763643, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9642670154571533, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8689895272254944, + "num_tokens": 195598114.0, + "step": 5125 + }, + { + "epoch": 0.6520798880549549, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8856853246688843, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8544314503669739, + "num_tokens": 195639826.0, + "step": 5126 + }, + { + "epoch": 0.6522070983335454, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.7352745532989502, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8666347861289978, + "num_tokens": 195682047.0, + "step": 5127 + }, + { + "epoch": 0.6523343086121358, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8351476192474365, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8629101514816284, + "num_tokens": 195722453.0, + "step": 5128 + }, + { + "epoch": 0.6524615188907263, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9303841590881348, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8547711372375488, + "num_tokens": 195759182.0, + "step": 5129 + }, + { + "epoch": 0.6525887291693169, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.849793791770935, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.863067090511322, + "num_tokens": 195795641.0, + "step": 5130 + }, + { + "epoch": 0.6527159394479074, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.7703803777694702, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.869878888130188, + "num_tokens": 195831200.0, + "step": 5131 + }, + { + "epoch": 0.6528431497264979, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.7898691892623901, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8613240718841553, + "num_tokens": 195875110.0, + "step": 5132 + }, + { + "epoch": 0.6529703600050885, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8271756172180176, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8626489639282227, + "num_tokens": 195916724.0, + "step": 5133 + }, + { + "epoch": 0.6530975702836789, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8973257541656494, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8533376455307007, + "num_tokens": 195956219.0, + "step": 5134 + }, + { + "epoch": 0.6532247805622694, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9854185581207275, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.862639844417572, + "num_tokens": 195991563.0, + "step": 5135 + }, + { + "epoch": 0.6533519908408599, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9608538150787354, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8664557933807373, + "num_tokens": 196031864.0, + "step": 5136 + }, + { + "epoch": 0.6534792011194505, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.005277156829834, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8516018390655518, + "num_tokens": 196067208.0, + "step": 5137 + }, + { + "epoch": 0.653606411398041, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8786042928695679, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8634613156318665, + "num_tokens": 196103934.0, + "step": 5138 + }, + { + "epoch": 0.6537336216766315, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9540393352508545, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8555927872657776, + "num_tokens": 196140886.0, + "step": 5139 + }, + { + "epoch": 0.6538608319552219, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9646189212799072, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.868548572063446, + "num_tokens": 196178940.0, + "step": 5140 + }, + { + "epoch": 0.6539880422338125, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8272024393081665, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8616508841514587, + "num_tokens": 196229461.0, + "step": 5141 + }, + { + "epoch": 0.654115252512403, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8829386234283447, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.872044563293457, + "num_tokens": 196265793.0, + "step": 5142 + }, + { + "epoch": 0.6542424627909935, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8502287864685059, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8660807609558105, + "num_tokens": 196304919.0, + "step": 5143 + }, + { + "epoch": 0.654369673069584, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8214293718338013, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8742782473564148, + "num_tokens": 196344761.0, + "step": 5144 + }, + { + "epoch": 0.6544968833481746, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7915269136428833, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8741128444671631, + "num_tokens": 196382625.0, + "step": 5145 + }, + { + "epoch": 0.654624093626765, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8456230163574219, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.871039867401123, + "num_tokens": 196422279.0, + "step": 5146 + }, + { + "epoch": 0.6547513039053555, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.868865966796875, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8567569851875305, + "num_tokens": 196463684.0, + "step": 5147 + }, + { + "epoch": 0.654878514183946, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8381506204605103, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.862445592880249, + "num_tokens": 196506319.0, + "step": 5148 + }, + { + "epoch": 0.6550057244625366, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8173094987869263, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.853725016117096, + "num_tokens": 196546367.0, + "step": 5149 + }, + { + "epoch": 0.6551329347411271, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.232036590576172, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.846428632736206, + "num_tokens": 196576598.0, + "step": 5150 + }, + { + "epoch": 0.6552601450197176, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.0546891689300537, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8534397482872009, + "num_tokens": 196610633.0, + "step": 5151 + }, + { + "epoch": 0.655387355298308, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.981994867324829, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8508497476577759, + "num_tokens": 196646183.0, + "step": 5152 + }, + { + "epoch": 0.6555145655768986, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7783604860305786, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8698410987854004, + "num_tokens": 196689454.0, + "step": 5153 + }, + { + "epoch": 0.6556417758554891, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8624591827392578, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8451516032218933, + "num_tokens": 196731820.0, + "step": 5154 + }, + { + "epoch": 0.6557689861340796, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9981350898742676, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8509646654129028, + "num_tokens": 196768542.0, + "step": 5155 + }, + { + "epoch": 0.6558961964126702, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7459311485290527, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8446763157844543, + "num_tokens": 196810300.0, + "step": 5156 + }, + { + "epoch": 0.6560234066912607, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9133542776107788, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8433593511581421, + "num_tokens": 196847271.0, + "step": 5157 + }, + { + "epoch": 0.6561506169698512, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.931193470954895, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8756647109985352, + "num_tokens": 196880439.0, + "step": 5158 + }, + { + "epoch": 0.6562778272484416, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.956528663635254, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8532894849777222, + "num_tokens": 196914911.0, + "step": 5159 + }, + { + "epoch": 0.6564050375270322, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.0022966861724854, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8594528436660767, + "num_tokens": 196949419.0, + "step": 5160 + }, + { + "epoch": 0.6565322478056227, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.841193437576294, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8527565002441406, + "num_tokens": 196989401.0, + "step": 5161 + }, + { + "epoch": 0.6566594580842132, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8254367113113403, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8523597717285156, + "num_tokens": 197028377.0, + "step": 5162 + }, + { + "epoch": 0.6567866683628037, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9315851926803589, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8394240140914917, + "num_tokens": 197068334.0, + "step": 5163 + }, + { + "epoch": 0.6569138786413943, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9084779024124146, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8620725274085999, + "num_tokens": 197105978.0, + "step": 5164 + }, + { + "epoch": 0.6570410889199847, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7709088325500488, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8422077894210815, + "num_tokens": 197148641.0, + "step": 5165 + }, + { + "epoch": 0.6571682991985752, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.8117624521255493, + "learning_rate": 1e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.840530276298523, + "num_tokens": 197188820.0, + "step": 5166 + }, + { + "epoch": 0.6572955094771658, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7976722717285156, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8470913171768188, + "num_tokens": 197227109.0, + "step": 5167 + }, + { + "epoch": 0.6574227197557563, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7738614082336426, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8653312921524048, + "num_tokens": 197264083.0, + "step": 5168 + }, + { + "epoch": 0.6575499300343468, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9361605644226074, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8545860052108765, + "num_tokens": 197301349.0, + "step": 5169 + }, + { + "epoch": 0.6576771403129373, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.73561692237854, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8485978245735168, + "num_tokens": 197348176.0, + "step": 5170 + }, + { + "epoch": 0.6578043505915278, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.9394587278366089, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8516931533813477, + "num_tokens": 197380490.0, + "step": 5171 + }, + { + "epoch": 0.6579315608701183, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8814977407455444, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8460429906845093, + "num_tokens": 197423075.0, + "step": 5172 + }, + { + "epoch": 0.6580587711487088, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.1747190952301025, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8622629642486572, + "num_tokens": 197461730.0, + "step": 5173 + }, + { + "epoch": 0.6581859814272993, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.7690709829330444, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8490219116210938, + "num_tokens": 197505801.0, + "step": 5174 + }, + { + "epoch": 0.6583131917058899, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.0105340480804443, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.875964343547821, + "num_tokens": 197534643.0, + "step": 5175 + }, + { + "epoch": 0.6584404019844804, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8106298446655273, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8658149838447571, + "num_tokens": 197576490.0, + "step": 5176 + }, + { + "epoch": 0.6585676122630708, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8990501165390015, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.873695969581604, + "num_tokens": 197612745.0, + "step": 5177 + }, + { + "epoch": 0.6586948225416613, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.912453055381775, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8544973134994507, + "num_tokens": 197653756.0, + "step": 5178 + }, + { + "epoch": 0.6588220328202519, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8820492029190063, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.884223222732544, + "num_tokens": 197694417.0, + "step": 5179 + }, + { + "epoch": 0.6589492430988424, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.0902326107025146, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8548291325569153, + "num_tokens": 197732738.0, + "step": 5180 + }, + { + "epoch": 0.6590764533774329, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.143808126449585, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8514722585678101, + "num_tokens": 197759408.0, + "step": 5181 + }, + { + "epoch": 0.6592036636560235, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8880544900894165, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8616321682929993, + "num_tokens": 197794334.0, + "step": 5182 + }, + { + "epoch": 0.6593308739346139, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8628535270690918, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8687664270401001, + "num_tokens": 197832497.0, + "step": 5183 + }, + { + "epoch": 0.6594580842132044, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.1029746532440186, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8505744934082031, + "num_tokens": 197869021.0, + "step": 5184 + }, + { + "epoch": 0.6595852944917949, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.0853559970855713, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8446807265281677, + "num_tokens": 197906365.0, + "step": 5185 + }, + { + "epoch": 0.6597125047703855, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.6893539428710938, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.861581563949585, + "num_tokens": 197950725.0, + "step": 5186 + }, + { + "epoch": 0.659839715048976, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.9948490858078003, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8668485879898071, + "num_tokens": 197991178.0, + "step": 5187 + }, + { + "epoch": 0.6599669253275665, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.7976359128952026, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.850036084651947, + "num_tokens": 198030176.0, + "step": 5188 + }, + { + "epoch": 0.6600941356061569, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8572224378585815, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8540059328079224, + "num_tokens": 198067169.0, + "step": 5189 + }, + { + "epoch": 0.6602213458847475, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.80327570438385, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8491432666778564, + "num_tokens": 198104994.0, + "step": 5190 + }, + { + "epoch": 0.660348556163338, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8769350051879883, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8631592988967896, + "num_tokens": 198142791.0, + "step": 5191 + }, + { + "epoch": 0.6604757664419285, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9465198516845703, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8554856777191162, + "num_tokens": 198179300.0, + "step": 5192 + }, + { + "epoch": 0.660602976720519, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9125291109085083, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8728989362716675, + "num_tokens": 198220230.0, + "step": 5193 + }, + { + "epoch": 0.6607301869991096, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.207864761352539, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.846899151802063, + "num_tokens": 198251067.0, + "step": 5194 + }, + { + "epoch": 0.6608573972777, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.4886372089385986, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8669809699058533, + "num_tokens": 198282130.0, + "step": 5195 + }, + { + "epoch": 0.6609846075562905, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8758090734481812, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8687601685523987, + "num_tokens": 198325609.0, + "step": 5196 + }, + { + "epoch": 0.661111817834881, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9368749856948853, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8629941940307617, + "num_tokens": 198365116.0, + "step": 5197 + }, + { + "epoch": 0.6612390281134716, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.7963273525238037, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8596484661102295, + "num_tokens": 198403467.0, + "step": 5198 + }, + { + "epoch": 0.6613662383920621, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.896136999130249, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8655225038528442, + "num_tokens": 198440676.0, + "step": 5199 + }, + { + "epoch": 0.6614934486706526, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.7484737634658813, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8610230684280396, + "num_tokens": 198479082.0, + "step": 5200 + }, + { + "epoch": 0.661620658949243, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8100756406784058, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8553996086120605, + "num_tokens": 198521741.0, + "step": 5201 + }, + { + "epoch": 0.6617478692278336, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.7447162866592407, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.852317214012146, + "num_tokens": 198562177.0, + "step": 5202 + }, + { + "epoch": 0.6618750795064241, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8626383543014526, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8600289821624756, + "num_tokens": 198600726.0, + "step": 5203 + }, + { + "epoch": 0.6620022897850146, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9554574489593506, + "learning_rate": 1e-06, + "loss": 0.5375, + "mean_token_accuracy": 0.8307654857635498, + "num_tokens": 198638156.0, + "step": 5204 + }, + { + "epoch": 0.6621295000636052, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.304922342300415, + "learning_rate": 1e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8422375321388245, + "num_tokens": 198680312.0, + "step": 5205 + }, + { + "epoch": 0.6622567103421957, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9243099689483643, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8774446249008179, + "num_tokens": 198723886.0, + "step": 5206 + }, + { + "epoch": 0.6623839206207861, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8892507553100586, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8591468334197998, + "num_tokens": 198765331.0, + "step": 5207 + }, + { + "epoch": 0.6625111308993766, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.91285240650177, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8636866807937622, + "num_tokens": 198802421.0, + "step": 5208 + }, + { + "epoch": 0.6626383411779672, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.7333263158798218, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.869138240814209, + "num_tokens": 198843335.0, + "step": 5209 + }, + { + "epoch": 0.6627655514565577, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.813225269317627, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8665331602096558, + "num_tokens": 198880925.0, + "step": 5210 + }, + { + "epoch": 0.6628927617351482, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.016726493835449, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8588138818740845, + "num_tokens": 198915448.0, + "step": 5211 + }, + { + "epoch": 0.6630199720137387, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.7813310623168945, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.869413435459137, + "num_tokens": 198952254.0, + "step": 5212 + }, + { + "epoch": 0.6631471822923293, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9906845092773438, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8636361360549927, + "num_tokens": 198990841.0, + "step": 5213 + }, + { + "epoch": 0.6632743925709197, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8061164617538452, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8618944883346558, + "num_tokens": 199029383.0, + "step": 5214 + }, + { + "epoch": 0.6634016028495102, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.04776668548584, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8593620657920837, + "num_tokens": 199066731.0, + "step": 5215 + }, + { + "epoch": 0.6635288131281007, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8432440757751465, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8720645308494568, + "num_tokens": 199108752.0, + "step": 5216 + }, + { + "epoch": 0.6636560234066913, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.007235288619995, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8630502223968506, + "num_tokens": 199146438.0, + "step": 5217 + }, + { + "epoch": 0.6637832336852818, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.995518445968628, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8618557453155518, + "num_tokens": 199186367.0, + "step": 5218 + }, + { + "epoch": 0.6639104439638723, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8322203159332275, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8720929622650146, + "num_tokens": 199225785.0, + "step": 5219 + }, + { + "epoch": 0.6640376542424627, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8879499435424805, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8627668619155884, + "num_tokens": 199262574.0, + "step": 5220 + }, + { + "epoch": 0.6641648645210533, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8185882568359375, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8639037609100342, + "num_tokens": 199299093.0, + "step": 5221 + }, + { + "epoch": 0.6642920747996438, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8703161478042603, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8651409149169922, + "num_tokens": 199337446.0, + "step": 5222 + }, + { + "epoch": 0.6644192850782343, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8221553564071655, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8619551658630371, + "num_tokens": 199377321.0, + "step": 5223 + }, + { + "epoch": 0.6645464953568249, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9476847648620605, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8706979751586914, + "num_tokens": 199413057.0, + "step": 5224 + }, + { + "epoch": 0.6646737056354154, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9191840887069702, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8668637871742249, + "num_tokens": 199452730.0, + "step": 5225 + }, + { + "epoch": 0.6648009159140058, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8558979034423828, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8707509636878967, + "num_tokens": 199487860.0, + "step": 5226 + }, + { + "epoch": 0.6649281261925963, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.818455457687378, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8679295778274536, + "num_tokens": 199532790.0, + "step": 5227 + }, + { + "epoch": 0.6650553364711869, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.962397575378418, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.879400908946991, + "num_tokens": 199561072.0, + "step": 5228 + }, + { + "epoch": 0.6651825467497774, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 16.606897354125977, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8653716444969177, + "num_tokens": 199595820.0, + "step": 5229 + }, + { + "epoch": 0.6653097570283679, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9571017026901245, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8479329943656921, + "num_tokens": 199640101.0, + "step": 5230 + }, + { + "epoch": 0.6654369673069584, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8980110883712769, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8557233214378357, + "num_tokens": 199680287.0, + "step": 5231 + }, + { + "epoch": 0.6655641775855489, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8490064144134521, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8596426248550415, + "num_tokens": 199721958.0, + "step": 5232 + }, + { + "epoch": 0.6656913878641394, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.9561572074890137, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8653484582901001, + "num_tokens": 199756021.0, + "step": 5233 + }, + { + "epoch": 0.6658185981427299, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.7494670152664185, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8648471832275391, + "num_tokens": 199793586.0, + "step": 5234 + }, + { + "epoch": 0.6659458084213205, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.7229772806167603, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8472082614898682, + "num_tokens": 199835728.0, + "step": 5235 + }, + { + "epoch": 0.666073018699911, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7923723459243774, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8695845603942871, + "num_tokens": 199873348.0, + "step": 5236 + }, + { + "epoch": 0.6662002289785015, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.891920566558838, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8600159883499146, + "num_tokens": 199908223.0, + "step": 5237 + }, + { + "epoch": 0.6663274392570919, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.7998660802841187, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8572070598602295, + "num_tokens": 199947017.0, + "step": 5238 + }, + { + "epoch": 0.6664546495356825, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9146634340286255, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.867769718170166, + "num_tokens": 199984806.0, + "step": 5239 + }, + { + "epoch": 0.666581859814273, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.029771327972412, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8537511825561523, + "num_tokens": 200022600.0, + "step": 5240 + }, + { + "epoch": 0.6667090700928635, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9854769706726074, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8692308664321899, + "num_tokens": 200058407.0, + "step": 5241 + }, + { + "epoch": 0.666836280371454, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.009225368499756, + "learning_rate": 1e-06, + "loss": 0.5268, + "mean_token_accuracy": 0.8342102766036987, + "num_tokens": 200101871.0, + "step": 5242 + }, + { + "epoch": 0.6669634906500446, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.9792296886444092, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8608089685440063, + "num_tokens": 200138501.0, + "step": 5243 + }, + { + "epoch": 0.667090700928635, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.962073564529419, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8590669631958008, + "num_tokens": 200176639.0, + "step": 5244 + }, + { + "epoch": 0.6672179112072255, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.026519775390625, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8548868298530579, + "num_tokens": 200211873.0, + "step": 5245 + }, + { + "epoch": 0.667345121485816, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9183845520019531, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8560280203819275, + "num_tokens": 200246596.0, + "step": 5246 + }, + { + "epoch": 0.6674723317644066, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8458842039108276, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8758827447891235, + "num_tokens": 200282761.0, + "step": 5247 + }, + { + "epoch": 0.6675995420429971, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9040472507476807, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8458893299102783, + "num_tokens": 200321821.0, + "step": 5248 + }, + { + "epoch": 0.6677267523215876, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9972621202468872, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8445921540260315, + "num_tokens": 200369066.0, + "step": 5249 + }, + { + "epoch": 0.667853962600178, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9318037033081055, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.85295569896698, + "num_tokens": 200408067.0, + "step": 5250 + }, + { + "epoch": 0.6679811728787686, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.770946741104126, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8778060078620911, + "num_tokens": 200446693.0, + "step": 5251 + }, + { + "epoch": 0.6681083831573591, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9552934169769287, + "learning_rate": 1e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8419995903968811, + "num_tokens": 200485340.0, + "step": 5252 + }, + { + "epoch": 0.6682355934359496, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9592478275299072, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8556408882141113, + "num_tokens": 200521978.0, + "step": 5253 + }, + { + "epoch": 0.6683628037145402, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8962278366088867, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8605614900588989, + "num_tokens": 200560604.0, + "step": 5254 + }, + { + "epoch": 0.6684900139931307, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8168901205062866, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8567087650299072, + "num_tokens": 200601718.0, + "step": 5255 + }, + { + "epoch": 0.6686172242717211, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.6931438446044922, + "learning_rate": 1e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8350767493247986, + "num_tokens": 200646186.0, + "step": 5256 + }, + { + "epoch": 0.6687444345503116, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.5591509342193604, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8616195917129517, + "num_tokens": 200684849.0, + "step": 5257 + }, + { + "epoch": 0.6688716448289022, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.990120768547058, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8660733103752136, + "num_tokens": 200719542.0, + "step": 5258 + }, + { + "epoch": 0.6689988551074927, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.7764688730239868, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8620529174804688, + "num_tokens": 200756212.0, + "step": 5259 + }, + { + "epoch": 0.6691260653860832, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8773396015167236, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8633127212524414, + "num_tokens": 200791184.0, + "step": 5260 + }, + { + "epoch": 0.6692532756646737, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.052450656890869, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8439382910728455, + "num_tokens": 200823966.0, + "step": 5261 + }, + { + "epoch": 0.6693804859432643, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8902263641357422, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8636716604232788, + "num_tokens": 200862718.0, + "step": 5262 + }, + { + "epoch": 0.6695076962218547, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7584136724472046, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8557132482528687, + "num_tokens": 200901755.0, + "step": 5263 + }, + { + "epoch": 0.6696349065004452, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8135011196136475, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8726067543029785, + "num_tokens": 200935788.0, + "step": 5264 + }, + { + "epoch": 0.6697621167790357, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.949027180671692, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8737022876739502, + "num_tokens": 200975159.0, + "step": 5265 + }, + { + "epoch": 0.6698893270576263, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8210557699203491, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8724272847175598, + "num_tokens": 201010813.0, + "step": 5266 + }, + { + "epoch": 0.6700165373362168, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.974361538887024, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.858931303024292, + "num_tokens": 201051937.0, + "step": 5267 + }, + { + "epoch": 0.6701437476148073, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8927042484283447, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8671138882637024, + "num_tokens": 201090926.0, + "step": 5268 + }, + { + "epoch": 0.6702709578933977, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.267904043197632, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8407890200614929, + "num_tokens": 201126086.0, + "step": 5269 + }, + { + "epoch": 0.6703981681719883, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9228142499923706, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8501707315444946, + "num_tokens": 201165933.0, + "step": 5270 + }, + { + "epoch": 0.6705253784505788, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.949866771697998, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8646174669265747, + "num_tokens": 201198794.0, + "step": 5271 + }, + { + "epoch": 0.6706525887291693, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.000793218612671, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8511703014373779, + "num_tokens": 201234081.0, + "step": 5272 + }, + { + "epoch": 0.6707797990077599, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.859924077987671, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.857040286064148, + "num_tokens": 201271796.0, + "step": 5273 + }, + { + "epoch": 0.6709070092863504, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8492852449417114, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8413601517677307, + "num_tokens": 201309526.0, + "step": 5274 + }, + { + "epoch": 0.6710342195649408, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8481031656265259, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8667196035385132, + "num_tokens": 201344015.0, + "step": 5275 + }, + { + "epoch": 0.6711614298435313, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.2196505069732666, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8605218529701233, + "num_tokens": 201385201.0, + "step": 5276 + }, + { + "epoch": 0.6712886401221219, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8803266286849976, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8653548955917358, + "num_tokens": 201422370.0, + "step": 5277 + }, + { + "epoch": 0.6714158504007124, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7789416313171387, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8447739481925964, + "num_tokens": 201467085.0, + "step": 5278 + }, + { + "epoch": 0.6715430606793029, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.0667548179626465, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8733522891998291, + "num_tokens": 201503432.0, + "step": 5279 + }, + { + "epoch": 0.6716702709578934, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 2.017559766769409, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8667020797729492, + "num_tokens": 201541358.0, + "step": 5280 + }, + { + "epoch": 0.6717974812364839, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 2.0057685375213623, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8561292886734009, + "num_tokens": 201580235.0, + "step": 5281 + }, + { + "epoch": 0.6719246915150744, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7781260013580322, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8599295616149902, + "num_tokens": 201627703.0, + "step": 5282 + }, + { + "epoch": 0.6720519017936649, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7884290218353271, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8630778193473816, + "num_tokens": 201667182.0, + "step": 5283 + }, + { + "epoch": 0.6721791120722554, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 2.022937536239624, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8614250421524048, + "num_tokens": 201699876.0, + "step": 5284 + }, + { + "epoch": 0.672306322350846, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.1713457107543945, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.850832998752594, + "num_tokens": 201732770.0, + "step": 5285 + }, + { + "epoch": 0.6724335326294365, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8563156127929688, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8621861934661865, + "num_tokens": 201771388.0, + "step": 5286 + }, + { + "epoch": 0.6725607429080269, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9091852903366089, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8551454544067383, + "num_tokens": 201809283.0, + "step": 5287 + }, + { + "epoch": 0.6726879531866174, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7488576173782349, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.866377592086792, + "num_tokens": 201854950.0, + "step": 5288 + }, + { + "epoch": 0.672815163465208, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7764757871627808, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8642467856407166, + "num_tokens": 201895638.0, + "step": 5289 + }, + { + "epoch": 0.6729423737437985, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.0699167251586914, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8468933701515198, + "num_tokens": 201928243.0, + "step": 5290 + }, + { + "epoch": 0.673069584022389, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9425225257873535, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8603246212005615, + "num_tokens": 201964687.0, + "step": 5291 + }, + { + "epoch": 0.6731967943009796, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8700029850006104, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8729033470153809, + "num_tokens": 201997166.0, + "step": 5292 + }, + { + "epoch": 0.67332400457957, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9425413608551025, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8621541261672974, + "num_tokens": 202030771.0, + "step": 5293 + }, + { + "epoch": 0.6734512148581605, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8991835117340088, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8526059985160828, + "num_tokens": 202077280.0, + "step": 5294 + }, + { + "epoch": 0.673578425136751, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8779659271240234, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8516834378242493, + "num_tokens": 202115273.0, + "step": 5295 + }, + { + "epoch": 0.6737056354153416, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9384821653366089, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.870672881603241, + "num_tokens": 202145347.0, + "step": 5296 + }, + { + "epoch": 0.6738328456939321, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.976994276046753, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8639363050460815, + "num_tokens": 202184621.0, + "step": 5297 + }, + { + "epoch": 0.6739600559725226, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7816261053085327, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.862856924533844, + "num_tokens": 202230099.0, + "step": 5298 + }, + { + "epoch": 0.674087266251113, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9003562927246094, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8534407019615173, + "num_tokens": 202267872.0, + "step": 5299 + }, + { + "epoch": 0.6742144765297036, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9455480575561523, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8584600687026978, + "num_tokens": 202305517.0, + "step": 5300 + }, + { + "epoch": 0.6743416868082941, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9663724899291992, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8455213904380798, + "num_tokens": 202340671.0, + "step": 5301 + }, + { + "epoch": 0.6744688970868846, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8983889818191528, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8678959012031555, + "num_tokens": 202379552.0, + "step": 5302 + }, + { + "epoch": 0.6745961073654752, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8099501132965088, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8547091484069824, + "num_tokens": 202421404.0, + "step": 5303 + }, + { + "epoch": 0.6747233176440657, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8214513063430786, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8463184833526611, + "num_tokens": 202463765.0, + "step": 5304 + }, + { + "epoch": 0.6748505279226561, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.756636381149292, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8551272749900818, + "num_tokens": 202505872.0, + "step": 5305 + }, + { + "epoch": 0.6749777382012466, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8572626113891602, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8582456111907959, + "num_tokens": 202544085.0, + "step": 5306 + }, + { + "epoch": 0.6751049484798372, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7808489799499512, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8563918471336365, + "num_tokens": 202588237.0, + "step": 5307 + }, + { + "epoch": 0.6752321587584277, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8117142915725708, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8528244495391846, + "num_tokens": 202628462.0, + "step": 5308 + }, + { + "epoch": 0.6753593690370182, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7773329019546509, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.861913800239563, + "num_tokens": 202668149.0, + "step": 5309 + }, + { + "epoch": 0.6754865793156087, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7752974033355713, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8745112419128418, + "num_tokens": 202708651.0, + "step": 5310 + }, + { + "epoch": 0.6756137895941993, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 2.0890111923217773, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8655325174331665, + "num_tokens": 202738613.0, + "step": 5311 + }, + { + "epoch": 0.6757409998727897, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9274020195007324, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8557157516479492, + "num_tokens": 202779279.0, + "step": 5312 + }, + { + "epoch": 0.6758682101513802, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9550426006317139, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8604006767272949, + "num_tokens": 202812095.0, + "step": 5313 + }, + { + "epoch": 0.6759954204299707, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.80277419090271, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8550413846969604, + "num_tokens": 202855705.0, + "step": 5314 + }, + { + "epoch": 0.6761226307085613, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9100390672683716, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8437176942825317, + "num_tokens": 202894180.0, + "step": 5315 + }, + { + "epoch": 0.6762498409871518, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.007758378982544, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.866998553276062, + "num_tokens": 202925842.0, + "step": 5316 + }, + { + "epoch": 0.6763770512657423, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.908474326133728, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8655152320861816, + "num_tokens": 202959103.0, + "step": 5317 + }, + { + "epoch": 0.6765042615443327, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7997621297836304, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8593884110450745, + "num_tokens": 203000393.0, + "step": 5318 + }, + { + "epoch": 0.6766314718229233, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.9999898672103882, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8610767126083374, + "num_tokens": 203041613.0, + "step": 5319 + }, + { + "epoch": 0.6767586821015138, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.9185206890106201, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8745361566543579, + "num_tokens": 203073744.0, + "step": 5320 + }, + { + "epoch": 0.6768858923801043, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.9026987552642822, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8448092937469482, + "num_tokens": 203108350.0, + "step": 5321 + }, + { + "epoch": 0.6770131026586949, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 2.004871129989624, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8634531497955322, + "num_tokens": 203139149.0, + "step": 5322 + }, + { + "epoch": 0.6771403129372854, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.9853392839431763, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8432801961898804, + "num_tokens": 203173953.0, + "step": 5323 + }, + { + "epoch": 0.6772675232158758, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 2.0341885089874268, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8700990676879883, + "num_tokens": 203208915.0, + "step": 5324 + }, + { + "epoch": 0.6773947334944663, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.838862657546997, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.847278356552124, + "num_tokens": 203251906.0, + "step": 5325 + }, + { + "epoch": 0.6775219437730569, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.9730019569396973, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8600199818611145, + "num_tokens": 203289460.0, + "step": 5326 + }, + { + "epoch": 0.6776491540516474, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.8750455379486084, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8686943650245667, + "num_tokens": 203325647.0, + "step": 5327 + }, + { + "epoch": 0.6777763643302379, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.9129096269607544, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.855179488658905, + "num_tokens": 203364972.0, + "step": 5328 + }, + { + "epoch": 0.6779035746088284, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.8405370712280273, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.859836220741272, + "num_tokens": 203407778.0, + "step": 5329 + }, + { + "epoch": 0.6780307848874189, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.8620644807815552, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8598499298095703, + "num_tokens": 203443820.0, + "step": 5330 + }, + { + "epoch": 0.6781579951660094, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.9423694610595703, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8647423982620239, + "num_tokens": 203475742.0, + "step": 5331 + }, + { + "epoch": 0.6782852054445999, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 2.14009690284729, + "learning_rate": 1e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.8348032236099243, + "num_tokens": 203512617.0, + "step": 5332 + }, + { + "epoch": 0.6784124157231904, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.9775432348251343, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8447002172470093, + "num_tokens": 203546778.0, + "step": 5333 + }, + { + "epoch": 0.678539626001781, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.923392653465271, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8618091940879822, + "num_tokens": 203585222.0, + "step": 5334 + }, + { + "epoch": 0.6786668362803715, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.778774380683899, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8534480333328247, + "num_tokens": 203627014.0, + "step": 5335 + }, + { + "epoch": 0.6787940465589619, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.9453022480010986, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8657039403915405, + "num_tokens": 203659189.0, + "step": 5336 + }, + { + "epoch": 0.6789212568375524, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.9543535709381104, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8703042268753052, + "num_tokens": 203695355.0, + "step": 5337 + }, + { + "epoch": 0.679048467116143, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.7858816385269165, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8691376447677612, + "num_tokens": 203733446.0, + "step": 5338 + }, + { + "epoch": 0.6791756773947335, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.801963210105896, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8425849676132202, + "num_tokens": 203775387.0, + "step": 5339 + }, + { + "epoch": 0.679302887673324, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.944311261177063, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.855134904384613, + "num_tokens": 203810713.0, + "step": 5340 + }, + { + "epoch": 0.6794300979519146, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.904542088508606, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8716235756874084, + "num_tokens": 203847144.0, + "step": 5341 + }, + { + "epoch": 0.679557308230505, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.922856092453003, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8656360507011414, + "num_tokens": 203889716.0, + "step": 5342 + }, + { + "epoch": 0.6796845185090955, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.921355128288269, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8459542393684387, + "num_tokens": 203927692.0, + "step": 5343 + }, + { + "epoch": 0.679811728787686, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.042356014251709, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.847461462020874, + "num_tokens": 203960180.0, + "step": 5344 + }, + { + "epoch": 0.6799389390662766, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8224008083343506, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8718462586402893, + "num_tokens": 203997375.0, + "step": 5345 + }, + { + "epoch": 0.6800661493448671, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9866352081298828, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8559316992759705, + "num_tokens": 204037588.0, + "step": 5346 + }, + { + "epoch": 0.6801933596234576, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.7615538835525513, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.852852463722229, + "num_tokens": 204080993.0, + "step": 5347 + }, + { + "epoch": 0.680320569902048, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.9556875228881836, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8619680404663086, + "num_tokens": 204116863.0, + "step": 5348 + }, + { + "epoch": 0.6804477801806386, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.827478051185608, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8546414971351624, + "num_tokens": 204156551.0, + "step": 5349 + }, + { + "epoch": 0.6805749904592291, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.7582690715789795, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8538205623626709, + "num_tokens": 204196210.0, + "step": 5350 + }, + { + "epoch": 0.6807022007378196, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.860977053642273, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8579530715942383, + "num_tokens": 204234791.0, + "step": 5351 + }, + { + "epoch": 0.6808294110164101, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.9575563669204712, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8645510077476501, + "num_tokens": 204271631.0, + "step": 5352 + }, + { + "epoch": 0.6809566212950007, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.783036708831787, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8697118163108826, + "num_tokens": 204311879.0, + "step": 5353 + }, + { + "epoch": 0.6810838315735911, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.7852376699447632, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.862422525882721, + "num_tokens": 204357123.0, + "step": 5354 + }, + { + "epoch": 0.6812110418521816, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.9483920335769653, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.858090877532959, + "num_tokens": 204397196.0, + "step": 5355 + }, + { + "epoch": 0.6813382521307721, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.920469045639038, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8505376577377319, + "num_tokens": 204434654.0, + "step": 5356 + }, + { + "epoch": 0.6814654624093627, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.0216081142425537, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.882229208946228, + "num_tokens": 204466581.0, + "step": 5357 + }, + { + "epoch": 0.6815926726879532, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.9737552404403687, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8724034428596497, + "num_tokens": 204503890.0, + "step": 5358 + }, + { + "epoch": 0.6817198829665437, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.2414417266845703, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8552522659301758, + "num_tokens": 204537160.0, + "step": 5359 + }, + { + "epoch": 0.6818470932451343, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9622225761413574, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8695179224014282, + "num_tokens": 204570077.0, + "step": 5360 + }, + { + "epoch": 0.6819743035237247, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8552731275558472, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.875584602355957, + "num_tokens": 204603638.0, + "step": 5361 + }, + { + "epoch": 0.6821015138023152, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7635083198547363, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8562620878219604, + "num_tokens": 204644584.0, + "step": 5362 + }, + { + "epoch": 0.6822287240809057, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.0148768424987793, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8464491367340088, + "num_tokens": 204678459.0, + "step": 5363 + }, + { + "epoch": 0.6823559343594963, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9155070781707764, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8561917543411255, + "num_tokens": 204712923.0, + "step": 5364 + }, + { + "epoch": 0.6824831446380868, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.8954347372055054, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8417077660560608, + "num_tokens": 204751303.0, + "step": 5365 + }, + { + "epoch": 0.6826103549166773, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8083704710006714, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.859795331954956, + "num_tokens": 204791685.0, + "step": 5366 + }, + { + "epoch": 0.6827375651952677, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9136067628860474, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8600598573684692, + "num_tokens": 204827748.0, + "step": 5367 + }, + { + "epoch": 0.6828647754738583, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7857966423034668, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8764008283615112, + "num_tokens": 204866568.0, + "step": 5368 + }, + { + "epoch": 0.6829919857524488, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.843388319015503, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8579376935958862, + "num_tokens": 204908334.0, + "step": 5369 + }, + { + "epoch": 0.6831191960310393, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9058905839920044, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.86248379945755, + "num_tokens": 204949929.0, + "step": 5370 + }, + { + "epoch": 0.6832464063096299, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9035087823867798, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8508294820785522, + "num_tokens": 204991435.0, + "step": 5371 + }, + { + "epoch": 0.6833736165882204, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.949537754058838, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8662071228027344, + "num_tokens": 205028434.0, + "step": 5372 + }, + { + "epoch": 0.6835008268668108, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.9740550518035889, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8542677760124207, + "num_tokens": 205064138.0, + "step": 5373 + }, + { + "epoch": 0.6836280371454013, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.7955437898635864, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8534724712371826, + "num_tokens": 205104058.0, + "step": 5374 + }, + { + "epoch": 0.6837552474239919, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8858834505081177, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8592530488967896, + "num_tokens": 205142609.0, + "step": 5375 + }, + { + "epoch": 0.6838824577025824, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.0596060752868652, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8690158128738403, + "num_tokens": 205180407.0, + "step": 5376 + }, + { + "epoch": 0.6840096679811729, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.9248335361480713, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8810562491416931, + "num_tokens": 205217340.0, + "step": 5377 + }, + { + "epoch": 0.6841368782597634, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 2.0670266151428223, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8475567102432251, + "num_tokens": 205263671.0, + "step": 5378 + }, + { + "epoch": 0.6842640885383539, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.789782166481018, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8567026257514954, + "num_tokens": 205302970.0, + "step": 5379 + }, + { + "epoch": 0.6843912988169444, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9480206966400146, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8633272647857666, + "num_tokens": 205342247.0, + "step": 5380 + }, + { + "epoch": 0.6845185090955349, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.0382416248321533, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8456292152404785, + "num_tokens": 205374897.0, + "step": 5381 + }, + { + "epoch": 0.6846457193741254, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8319557905197144, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8563140034675598, + "num_tokens": 205411645.0, + "step": 5382 + }, + { + "epoch": 0.684772929652716, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.002317428588867, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8598451018333435, + "num_tokens": 205450972.0, + "step": 5383 + }, + { + "epoch": 0.6849001399313065, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8059897422790527, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8614523410797119, + "num_tokens": 205494314.0, + "step": 5384 + }, + { + "epoch": 0.6850273502098969, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.7595548629760742, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8480421900749207, + "num_tokens": 205534696.0, + "step": 5385 + }, + { + "epoch": 0.6851545604884874, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.7390426397323608, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8716007471084595, + "num_tokens": 205575579.0, + "step": 5386 + }, + { + "epoch": 0.685281770767078, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7794326543807983, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8636102080345154, + "num_tokens": 205609249.0, + "step": 5387 + }, + { + "epoch": 0.6854089810456685, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8698123693466187, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8567644953727722, + "num_tokens": 205644135.0, + "step": 5388 + }, + { + "epoch": 0.685536191324259, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.9419881105422974, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8529931902885437, + "num_tokens": 205685312.0, + "step": 5389 + }, + { + "epoch": 0.6856634016028496, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8809504508972168, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8530333638191223, + "num_tokens": 205720525.0, + "step": 5390 + }, + { + "epoch": 0.68579061188144, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8248329162597656, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8403441309928894, + "num_tokens": 205763358.0, + "step": 5391 + }, + { + "epoch": 0.6859178221600305, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8898063898086548, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.862345814704895, + "num_tokens": 205801413.0, + "step": 5392 + }, + { + "epoch": 0.686045032438621, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.048452615737915, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8614664077758789, + "num_tokens": 205840442.0, + "step": 5393 + }, + { + "epoch": 0.6861722427172116, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8673065900802612, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8726521134376526, + "num_tokens": 205877319.0, + "step": 5394 + }, + { + "epoch": 0.6862994529958021, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.979800820350647, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8574100136756897, + "num_tokens": 205916186.0, + "step": 5395 + }, + { + "epoch": 0.6864266632743926, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9010800123214722, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8485517501831055, + "num_tokens": 205956336.0, + "step": 5396 + }, + { + "epoch": 0.686553873552983, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9430543184280396, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8582772612571716, + "num_tokens": 205989570.0, + "step": 5397 + }, + { + "epoch": 0.6866810838315736, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9236212968826294, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8515582084655762, + "num_tokens": 206026139.0, + "step": 5398 + }, + { + "epoch": 0.6868082941101641, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.238907814025879, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8550900220870972, + "num_tokens": 206070059.0, + "step": 5399 + }, + { + "epoch": 0.6869355043887546, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9878506660461426, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8636397123336792, + "num_tokens": 206104645.0, + "step": 5400 + }, + { + "epoch": 0.6870627146673451, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8055704832077026, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8566690683364868, + "num_tokens": 206141220.0, + "step": 5401 + }, + { + "epoch": 0.6871899249459357, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8992135524749756, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8561056852340698, + "num_tokens": 206177917.0, + "step": 5402 + }, + { + "epoch": 0.6873171352245261, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8949497938156128, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8577514290809631, + "num_tokens": 206213419.0, + "step": 5403 + }, + { + "epoch": 0.6874443455031166, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.867386817932129, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8437744379043579, + "num_tokens": 206252421.0, + "step": 5404 + }, + { + "epoch": 0.6875715557817071, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8706601858139038, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8640198707580566, + "num_tokens": 206287195.0, + "step": 5405 + }, + { + "epoch": 0.6876987660602977, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9487882852554321, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8397415280342102, + "num_tokens": 206328339.0, + "step": 5406 + }, + { + "epoch": 0.6878259763388882, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9113554954528809, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8439532518386841, + "num_tokens": 206369523.0, + "step": 5407 + }, + { + "epoch": 0.6879531866174787, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.3058600425720215, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8785922527313232, + "num_tokens": 206402864.0, + "step": 5408 + }, + { + "epoch": 0.6880803968960693, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.9879150390625, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8483712077140808, + "num_tokens": 206439004.0, + "step": 5409 + }, + { + "epoch": 0.6882076071746597, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.018683671951294, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8614015579223633, + "num_tokens": 206472320.0, + "step": 5410 + }, + { + "epoch": 0.6883348174532502, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9303855895996094, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8500810265541077, + "num_tokens": 206518649.0, + "step": 5411 + }, + { + "epoch": 0.6884620277318407, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9665277004241943, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8746167421340942, + "num_tokens": 206551318.0, + "step": 5412 + }, + { + "epoch": 0.6885892380104313, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8669160604476929, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8474623560905457, + "num_tokens": 206588173.0, + "step": 5413 + }, + { + "epoch": 0.6887164482890218, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.0473928451538086, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8582723140716553, + "num_tokens": 206624013.0, + "step": 5414 + }, + { + "epoch": 0.6888436585676123, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.6980940103530884, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8660434484481812, + "num_tokens": 206668180.0, + "step": 5415 + }, + { + "epoch": 0.6889708688462027, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.768558144569397, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8541519641876221, + "num_tokens": 206710619.0, + "step": 5416 + }, + { + "epoch": 0.6890980791247933, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7541093826293945, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8641384840011597, + "num_tokens": 206750121.0, + "step": 5417 + }, + { + "epoch": 0.6892252894033838, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8092910051345825, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8711004853248596, + "num_tokens": 206789538.0, + "step": 5418 + }, + { + "epoch": 0.6893524996819743, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9335371255874634, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8597462773323059, + "num_tokens": 206827513.0, + "step": 5419 + }, + { + "epoch": 0.6894797099605648, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.150334596633911, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8654305934906006, + "num_tokens": 206866563.0, + "step": 5420 + }, + { + "epoch": 0.6896069202391554, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.9809603691101074, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8698422908782959, + "num_tokens": 206906281.0, + "step": 5421 + }, + { + "epoch": 0.6897341305177458, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.929266095161438, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8405930399894714, + "num_tokens": 206944055.0, + "step": 5422 + }, + { + "epoch": 0.6898613407963363, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.8925701379776, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8600995540618896, + "num_tokens": 206984421.0, + "step": 5423 + }, + { + "epoch": 0.6899885510749268, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.8922626972198486, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8478801846504211, + "num_tokens": 207020319.0, + "step": 5424 + }, + { + "epoch": 0.6901157613535174, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.7841925621032715, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8625098466873169, + "num_tokens": 207061487.0, + "step": 5425 + }, + { + "epoch": 0.6902429716321079, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.845599889755249, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8608056306838989, + "num_tokens": 207098343.0, + "step": 5426 + }, + { + "epoch": 0.6903701819106984, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8302695751190186, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.856219470500946, + "num_tokens": 207140381.0, + "step": 5427 + }, + { + "epoch": 0.6904973921892888, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.1175918579101562, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8725556135177612, + "num_tokens": 207173923.0, + "step": 5428 + }, + { + "epoch": 0.6906246024678794, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.8396434783935547, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8669660091400146, + "num_tokens": 207214840.0, + "step": 5429 + }, + { + "epoch": 0.6907518127464699, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9372868537902832, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8588795065879822, + "num_tokens": 207247850.0, + "step": 5430 + }, + { + "epoch": 0.6908790230250604, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8867403268814087, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8599867224693298, + "num_tokens": 207286698.0, + "step": 5431 + }, + { + "epoch": 0.691006233303651, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.826901912689209, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8541344404220581, + "num_tokens": 207322470.0, + "step": 5432 + }, + { + "epoch": 0.6911334435822415, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.8447105884552002, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8455727100372314, + "num_tokens": 207361517.0, + "step": 5433 + }, + { + "epoch": 0.6912606538608319, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9928958415985107, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8695900440216064, + "num_tokens": 207394504.0, + "step": 5434 + }, + { + "epoch": 0.6913878641394224, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9580926895141602, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.842095136642456, + "num_tokens": 207430148.0, + "step": 5435 + }, + { + "epoch": 0.691515074418013, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9592859745025635, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8575552701950073, + "num_tokens": 207461094.0, + "step": 5436 + }, + { + "epoch": 0.6916422846966035, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.333889961242676, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8665782809257507, + "num_tokens": 207500572.0, + "step": 5437 + }, + { + "epoch": 0.691769494975194, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.183849334716797, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.852493166923523, + "num_tokens": 207534977.0, + "step": 5438 + }, + { + "epoch": 0.6918967052537845, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.2160708904266357, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8645489811897278, + "num_tokens": 207569809.0, + "step": 5439 + }, + { + "epoch": 0.692023915532375, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9461495876312256, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8602739572525024, + "num_tokens": 207606785.0, + "step": 5440 + }, + { + "epoch": 0.6921511258109655, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8725666999816895, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8589318990707397, + "num_tokens": 207648344.0, + "step": 5441 + }, + { + "epoch": 0.692278336089556, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9173470735549927, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8680903911590576, + "num_tokens": 207680962.0, + "step": 5442 + }, + { + "epoch": 0.6924055463681466, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9791901111602783, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.892509937286377, + "num_tokens": 207709162.0, + "step": 5443 + }, + { + "epoch": 0.6925327566467371, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.862104058265686, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8553769588470459, + "num_tokens": 207746089.0, + "step": 5444 + }, + { + "epoch": 0.6926599669253276, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8069992065429688, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8711341619491577, + "num_tokens": 207791235.0, + "step": 5445 + }, + { + "epoch": 0.692787177203918, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.0472843647003174, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8521684408187866, + "num_tokens": 207824527.0, + "step": 5446 + }, + { + "epoch": 0.6929143874825086, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8166230916976929, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8704010844230652, + "num_tokens": 207863535.0, + "step": 5447 + }, + { + "epoch": 0.6930415977610991, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.723532795906067, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8531386256217957, + "num_tokens": 207899694.0, + "step": 5448 + }, + { + "epoch": 0.6931688080396896, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.14167857170105, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8513144850730896, + "num_tokens": 207931440.0, + "step": 5449 + }, + { + "epoch": 0.6932960183182801, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.031979560852051, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8585683107376099, + "num_tokens": 207963758.0, + "step": 5450 + }, + { + "epoch": 0.6934232285968707, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.870678186416626, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8636505603790283, + "num_tokens": 208002383.0, + "step": 5451 + }, + { + "epoch": 0.6935504388754611, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8326750993728638, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8637453317642212, + "num_tokens": 208038723.0, + "step": 5452 + }, + { + "epoch": 0.6936776491540516, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9282236099243164, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8645622134208679, + "num_tokens": 208071851.0, + "step": 5453 + }, + { + "epoch": 0.6938048594326421, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.00692081451416, + "learning_rate": 1e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8419731855392456, + "num_tokens": 208106916.0, + "step": 5454 + }, + { + "epoch": 0.6939320697112327, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9746267795562744, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8700404763221741, + "num_tokens": 208146883.0, + "step": 5455 + }, + { + "epoch": 0.6940592799898232, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.0142853260040283, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8610888719558716, + "num_tokens": 208187364.0, + "step": 5456 + }, + { + "epoch": 0.6941864902684137, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.7979224920272827, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8663797378540039, + "num_tokens": 208226087.0, + "step": 5457 + }, + { + "epoch": 0.6943137005470043, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.204195022583008, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8579504489898682, + "num_tokens": 208265193.0, + "step": 5458 + }, + { + "epoch": 0.6944409108255947, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.7227050065994263, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8737568855285645, + "num_tokens": 208305163.0, + "step": 5459 + }, + { + "epoch": 0.6945681211041852, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8989572525024414, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8486542701721191, + "num_tokens": 208342229.0, + "step": 5460 + }, + { + "epoch": 0.6946953313827757, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8224084377288818, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8499226570129395, + "num_tokens": 208384606.0, + "step": 5461 + }, + { + "epoch": 0.6948225416613663, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.880980134010315, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8784143924713135, + "num_tokens": 208416777.0, + "step": 5462 + }, + { + "epoch": 0.6949497519399568, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8911752700805664, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8453887701034546, + "num_tokens": 208456101.0, + "step": 5463 + }, + { + "epoch": 0.6950769622185473, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9530415534973145, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8690188527107239, + "num_tokens": 208489201.0, + "step": 5464 + }, + { + "epoch": 0.6952041724971377, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9746787548065186, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8749285340309143, + "num_tokens": 208525006.0, + "step": 5465 + }, + { + "epoch": 0.6953313827757283, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8264122009277344, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8570405840873718, + "num_tokens": 208568997.0, + "step": 5466 + }, + { + "epoch": 0.6954585930543188, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.0139050483703613, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8510710000991821, + "num_tokens": 208605925.0, + "step": 5467 + }, + { + "epoch": 0.6955858033329093, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9457844495773315, + "learning_rate": 1e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.8346713185310364, + "num_tokens": 208646290.0, + "step": 5468 + }, + { + "epoch": 0.6957130136114998, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.089113712310791, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8603684306144714, + "num_tokens": 208686424.0, + "step": 5469 + }, + { + "epoch": 0.6958402238900904, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9071389436721802, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8617363572120667, + "num_tokens": 208719643.0, + "step": 5470 + }, + { + "epoch": 0.6959674341686808, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9178935289382935, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8447389006614685, + "num_tokens": 208760804.0, + "step": 5471 + }, + { + "epoch": 0.6960946444472713, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.7950493097305298, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8546983003616333, + "num_tokens": 208803022.0, + "step": 5472 + }, + { + "epoch": 0.6962218547258618, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8306573629379272, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8606542348861694, + "num_tokens": 208844205.0, + "step": 5473 + }, + { + "epoch": 0.6963490650044524, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.6892905235290527, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8712904453277588, + "num_tokens": 208886882.0, + "step": 5474 + }, + { + "epoch": 0.6964762752830429, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.916042447090149, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8578683137893677, + "num_tokens": 208926649.0, + "step": 5475 + }, + { + "epoch": 0.6966034855616334, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.7898467779159546, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8539676070213318, + "num_tokens": 208963403.0, + "step": 5476 + }, + { + "epoch": 0.6967306958402238, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.9481029510498047, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8518691062927246, + "num_tokens": 209002475.0, + "step": 5477 + }, + { + "epoch": 0.6968579061188144, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.1225852966308594, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8594763278961182, + "num_tokens": 209044216.0, + "step": 5478 + }, + { + "epoch": 0.6969851163974049, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.229421377182007, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8496469259262085, + "num_tokens": 209080774.0, + "step": 5479 + }, + { + "epoch": 0.6971123266759954, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8502002954483032, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8658185005187988, + "num_tokens": 209117167.0, + "step": 5480 + }, + { + "epoch": 0.697239536954586, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.0288455486297607, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8546534776687622, + "num_tokens": 209148872.0, + "step": 5481 + }, + { + "epoch": 0.6973667472331765, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.906376600265503, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8615390658378601, + "num_tokens": 209187912.0, + "step": 5482 + }, + { + "epoch": 0.6974939575117669, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.1397132873535156, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8653201460838318, + "num_tokens": 209227055.0, + "step": 5483 + }, + { + "epoch": 0.6976211677903574, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.1043014526367188, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8720448017120361, + "num_tokens": 209255343.0, + "step": 5484 + }, + { + "epoch": 0.697748378068948, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8785020112991333, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8526208400726318, + "num_tokens": 209294621.0, + "step": 5485 + }, + { + "epoch": 0.6978755883475385, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.854362964630127, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8764188289642334, + "num_tokens": 209336519.0, + "step": 5486 + }, + { + "epoch": 0.698002798626129, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9367377758026123, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8567421436309814, + "num_tokens": 209372913.0, + "step": 5487 + }, + { + "epoch": 0.6981300089047195, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8534656763076782, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8517758846282959, + "num_tokens": 209417768.0, + "step": 5488 + }, + { + "epoch": 0.69825721918331, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.087139129638672, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8622592091560364, + "num_tokens": 209451221.0, + "step": 5489 + }, + { + "epoch": 0.6983844294619005, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.00697660446167, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8591374158859253, + "num_tokens": 209488710.0, + "step": 5490 + }, + { + "epoch": 0.698511639740491, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9851598739624023, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8565123081207275, + "num_tokens": 209525166.0, + "step": 5491 + }, + { + "epoch": 0.6986388500190815, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.765596628189087, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8617264628410339, + "num_tokens": 209564804.0, + "step": 5492 + }, + { + "epoch": 0.6987660602976721, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8817952871322632, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.863064706325531, + "num_tokens": 209607155.0, + "step": 5493 + }, + { + "epoch": 0.6988932705762626, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.0094621181488037, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8499743938446045, + "num_tokens": 209641226.0, + "step": 5494 + }, + { + "epoch": 0.699020480854853, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.0766968727111816, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8569803237915039, + "num_tokens": 209679180.0, + "step": 5495 + }, + { + "epoch": 0.6991476911334435, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.875502347946167, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8522540926933289, + "num_tokens": 209716412.0, + "step": 5496 + }, + { + "epoch": 0.6992749014120341, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9854698181152344, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.850469708442688, + "num_tokens": 209755697.0, + "step": 5497 + }, + { + "epoch": 0.6994021116906246, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.011667490005493, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8658396005630493, + "num_tokens": 209793654.0, + "step": 5498 + }, + { + "epoch": 0.6995293219692151, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8850884437561035, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8573775887489319, + "num_tokens": 209830703.0, + "step": 5499 + }, + { + "epoch": 0.6996565322478057, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.939788818359375, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8595230579376221, + "num_tokens": 209868994.0, + "step": 5500 + }, + { + "epoch": 0.6997837425263961, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.1477444171905518, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8692421913146973, + "num_tokens": 209909605.0, + "step": 5501 + }, + { + "epoch": 0.6999109528049866, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.022284746170044, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8657869696617126, + "num_tokens": 209945196.0, + "step": 5502 + }, + { + "epoch": 0.7000381630835771, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.990119218826294, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8590509295463562, + "num_tokens": 209988585.0, + "step": 5503 + }, + { + "epoch": 0.7001653733621677, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8416212797164917, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8607052564620972, + "num_tokens": 210032084.0, + "step": 5504 + }, + { + "epoch": 0.7002925836407582, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8771651983261108, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8500788807868958, + "num_tokens": 210070639.0, + "step": 5505 + }, + { + "epoch": 0.7004197939193487, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9891201257705688, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8683295249938965, + "num_tokens": 210113793.0, + "step": 5506 + }, + { + "epoch": 0.7005470041979391, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9461456537246704, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8521791696548462, + "num_tokens": 210153517.0, + "step": 5507 + }, + { + "epoch": 0.7006742144765297, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.066714286804199, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8549894094467163, + "num_tokens": 210186818.0, + "step": 5508 + }, + { + "epoch": 0.7008014247551202, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9309861660003662, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8523908853530884, + "num_tokens": 210224283.0, + "step": 5509 + }, + { + "epoch": 0.7009286350337107, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9647573232650757, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.855529248714447, + "num_tokens": 210263439.0, + "step": 5510 + }, + { + "epoch": 0.7010558453123013, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.175276756286621, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8616556525230408, + "num_tokens": 210305887.0, + "step": 5511 + }, + { + "epoch": 0.7011830555908918, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.2061266899108887, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8589235544204712, + "num_tokens": 210340118.0, + "step": 5512 + }, + { + "epoch": 0.7013102658694823, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.0437886714935303, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8597801923751831, + "num_tokens": 210376098.0, + "step": 5513 + }, + { + "epoch": 0.7014374761480727, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8410886526107788, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8638532757759094, + "num_tokens": 210418801.0, + "step": 5514 + }, + { + "epoch": 0.7015646864266633, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.962136149406433, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.849115788936615, + "num_tokens": 210452392.0, + "step": 5515 + }, + { + "epoch": 0.7016918967052538, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9375203847885132, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8673381805419922, + "num_tokens": 210489697.0, + "step": 5516 + }, + { + "epoch": 0.7018191069838443, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.0288984775543213, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8440592288970947, + "num_tokens": 210530212.0, + "step": 5517 + }, + { + "epoch": 0.7019463172624348, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9946527481079102, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8660655617713928, + "num_tokens": 210564974.0, + "step": 5518 + }, + { + "epoch": 0.7020735275410254, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.974341869354248, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8598611354827881, + "num_tokens": 210601272.0, + "step": 5519 + }, + { + "epoch": 0.7022007378196158, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9588011503219604, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8619227409362793, + "num_tokens": 210636908.0, + "step": 5520 + }, + { + "epoch": 0.7023279480982063, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.960391879081726, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8569279313087463, + "num_tokens": 210672629.0, + "step": 5521 + }, + { + "epoch": 0.7024551583767968, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.0410735607147217, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8742534518241882, + "num_tokens": 210703531.0, + "step": 5522 + }, + { + "epoch": 0.7025823686553874, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.7213973999023438, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8727240562438965, + "num_tokens": 210742946.0, + "step": 5523 + }, + { + "epoch": 0.7027095789339779, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.039314031600952, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8684396743774414, + "num_tokens": 210778345.0, + "step": 5524 + }, + { + "epoch": 0.7028367892125684, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8511101007461548, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8654050827026367, + "num_tokens": 210821544.0, + "step": 5525 + }, + { + "epoch": 0.7029639994911588, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9400787353515625, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8644912838935852, + "num_tokens": 210858060.0, + "step": 5526 + }, + { + "epoch": 0.7030912097697494, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8099318742752075, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8549774885177612, + "num_tokens": 210900365.0, + "step": 5527 + }, + { + "epoch": 0.7032184200483399, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.890635371208191, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8547759652137756, + "num_tokens": 210941118.0, + "step": 5528 + }, + { + "epoch": 0.7033456303269304, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.172626495361328, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8673686981201172, + "num_tokens": 210979865.0, + "step": 5529 + }, + { + "epoch": 0.703472840605521, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9099822044372559, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.860061526298523, + "num_tokens": 211016385.0, + "step": 5530 + }, + { + "epoch": 0.7036000508841115, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8974683284759521, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8474746942520142, + "num_tokens": 211059071.0, + "step": 5531 + }, + { + "epoch": 0.7037272611627019, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8326447010040283, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8623918294906616, + "num_tokens": 211094189.0, + "step": 5532 + }, + { + "epoch": 0.7038544714412924, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9351545572280884, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8733595609664917, + "num_tokens": 211130524.0, + "step": 5533 + }, + { + "epoch": 0.703981681719883, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8390334844589233, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.853224515914917, + "num_tokens": 211172335.0, + "step": 5534 + }, + { + "epoch": 0.7041088919984735, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9830621480941772, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8622300624847412, + "num_tokens": 211205406.0, + "step": 5535 + }, + { + "epoch": 0.704236102277064, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.7767914533615112, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8556905388832092, + "num_tokens": 211246443.0, + "step": 5536 + }, + { + "epoch": 0.7043633125556545, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9582087993621826, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8734742403030396, + "num_tokens": 211286707.0, + "step": 5537 + }, + { + "epoch": 0.704490522834245, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9185516834259033, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8710250854492188, + "num_tokens": 211319448.0, + "step": 5538 + }, + { + "epoch": 0.7046177331128355, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8943084478378296, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8549046516418457, + "num_tokens": 211363747.0, + "step": 5539 + }, + { + "epoch": 0.704744943391426, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9014220237731934, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8662905693054199, + "num_tokens": 211408080.0, + "step": 5540 + }, + { + "epoch": 0.7048721536700165, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8165360689163208, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8498332500457764, + "num_tokens": 211445656.0, + "step": 5541 + }, + { + "epoch": 0.7049993639486071, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.7217121124267578, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8681114912033081, + "num_tokens": 211484342.0, + "step": 5542 + }, + { + "epoch": 0.7051265742271976, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9147201776504517, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8553127646446228, + "num_tokens": 211518535.0, + "step": 5543 + }, + { + "epoch": 0.705253784505788, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8962730169296265, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8533498644828796, + "num_tokens": 211557236.0, + "step": 5544 + }, + { + "epoch": 0.7053809947843785, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.7906612157821655, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8712413311004639, + "num_tokens": 211599216.0, + "step": 5545 + }, + { + "epoch": 0.7055082050629691, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8592184782028198, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8415136337280273, + "num_tokens": 211638930.0, + "step": 5546 + }, + { + "epoch": 0.7056354153415596, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8730627298355103, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.853056788444519, + "num_tokens": 211680765.0, + "step": 5547 + }, + { + "epoch": 0.7057626256201501, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9080030918121338, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8745285272598267, + "num_tokens": 211719037.0, + "step": 5548 + }, + { + "epoch": 0.7058898358987407, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.929803490638733, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8548441529273987, + "num_tokens": 211759655.0, + "step": 5549 + }, + { + "epoch": 0.7060170461773311, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.005232572555542, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8430353403091431, + "num_tokens": 211800405.0, + "step": 5550 + }, + { + "epoch": 0.7061442564559216, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7886854410171509, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8419643640518188, + "num_tokens": 211846057.0, + "step": 5551 + }, + { + "epoch": 0.7062714667345121, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9352195262908936, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8574798107147217, + "num_tokens": 211885944.0, + "step": 5552 + }, + { + "epoch": 0.7063986770131027, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.6882187128067017, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.864802360534668, + "num_tokens": 211928727.0, + "step": 5553 + }, + { + "epoch": 0.7065258872916932, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.7961598634719849, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8499441146850586, + "num_tokens": 211967593.0, + "step": 5554 + }, + { + "epoch": 0.7066530975702837, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.1105430126190186, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.853789746761322, + "num_tokens": 212001023.0, + "step": 5555 + }, + { + "epoch": 0.7067803078488741, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9757320880889893, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8505823612213135, + "num_tokens": 212045611.0, + "step": 5556 + }, + { + "epoch": 0.7069075181274647, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9512561559677124, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8555080890655518, + "num_tokens": 212087167.0, + "step": 5557 + }, + { + "epoch": 0.7070347284060552, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.2275092601776123, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8607510924339294, + "num_tokens": 212124254.0, + "step": 5558 + }, + { + "epoch": 0.7071619386846457, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8354483842849731, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8534411191940308, + "num_tokens": 212162706.0, + "step": 5559 + }, + { + "epoch": 0.7072891489632362, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8951280117034912, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.858885645866394, + "num_tokens": 212197359.0, + "step": 5560 + }, + { + "epoch": 0.7074163592418268, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.77290940284729, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8663960695266724, + "num_tokens": 212237300.0, + "step": 5561 + }, + { + "epoch": 0.7075435695204173, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.6854922771453857, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8398633003234863, + "num_tokens": 212283472.0, + "step": 5562 + }, + { + "epoch": 0.7076707797990077, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8594938516616821, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8546907901763916, + "num_tokens": 212321433.0, + "step": 5563 + }, + { + "epoch": 0.7077979900775982, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7916024923324585, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.857140064239502, + "num_tokens": 212362234.0, + "step": 5564 + }, + { + "epoch": 0.7079252003561888, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.057830333709717, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8503328561782837, + "num_tokens": 212399259.0, + "step": 5565 + }, + { + "epoch": 0.7080524106347793, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.864295482635498, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8597877621650696, + "num_tokens": 212438489.0, + "step": 5566 + }, + { + "epoch": 0.7081796209133698, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8232184648513794, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8509619832038879, + "num_tokens": 212482283.0, + "step": 5567 + }, + { + "epoch": 0.7083068311919604, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.807810664176941, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8535906076431274, + "num_tokens": 212522749.0, + "step": 5568 + }, + { + "epoch": 0.7084340414705508, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.82426118850708, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8505773544311523, + "num_tokens": 212559937.0, + "step": 5569 + }, + { + "epoch": 0.7085612517491413, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.351439952850342, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8640225529670715, + "num_tokens": 212595578.0, + "step": 5570 + }, + { + "epoch": 0.7086884620277318, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.087165355682373, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8701353073120117, + "num_tokens": 212634085.0, + "step": 5571 + }, + { + "epoch": 0.7088156723063224, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.950386643409729, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8494099974632263, + "num_tokens": 212673557.0, + "step": 5572 + }, + { + "epoch": 0.7089428825849129, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9633594751358032, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8685257434844971, + "num_tokens": 212711142.0, + "step": 5573 + }, + { + "epoch": 0.7090700928635034, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9455870389938354, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8594969511032104, + "num_tokens": 212745570.0, + "step": 5574 + }, + { + "epoch": 0.7091973031420938, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8862403631210327, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8619734048843384, + "num_tokens": 212783048.0, + "step": 5575 + }, + { + "epoch": 0.7093245134206844, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.857800841331482, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8762248754501343, + "num_tokens": 212821231.0, + "step": 5576 + }, + { + "epoch": 0.7094517236992749, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8594502210617065, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8562321662902832, + "num_tokens": 212861715.0, + "step": 5577 + }, + { + "epoch": 0.7095789339778654, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9720661640167236, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8496683239936829, + "num_tokens": 212900655.0, + "step": 5578 + }, + { + "epoch": 0.709706144256456, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.963066816329956, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8514752388000488, + "num_tokens": 212939308.0, + "step": 5579 + }, + { + "epoch": 0.7098333545350465, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7979124784469604, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8634688854217529, + "num_tokens": 212981174.0, + "step": 5580 + }, + { + "epoch": 0.7099605648136369, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8300565481185913, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8688719868659973, + "num_tokens": 213016401.0, + "step": 5581 + }, + { + "epoch": 0.7100877750922274, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8489989042282104, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.846042811870575, + "num_tokens": 213058480.0, + "step": 5582 + }, + { + "epoch": 0.710214985370818, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.967406153678894, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8704036474227905, + "num_tokens": 213093527.0, + "step": 5583 + }, + { + "epoch": 0.7103421956494085, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8486288785934448, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8564721941947937, + "num_tokens": 213126778.0, + "step": 5584 + }, + { + "epoch": 0.710469405927999, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.796929121017456, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8618900179862976, + "num_tokens": 213170863.0, + "step": 5585 + }, + { + "epoch": 0.7105966162065895, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.349968671798706, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.835658073425293, + "num_tokens": 213211336.0, + "step": 5586 + }, + { + "epoch": 0.71072382648518, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.0629868507385254, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8653159141540527, + "num_tokens": 213245262.0, + "step": 5587 + }, + { + "epoch": 0.7108510367637705, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.0220234394073486, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8499423265457153, + "num_tokens": 213282150.0, + "step": 5588 + }, + { + "epoch": 0.710978247042361, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7555159330368042, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8638011813163757, + "num_tokens": 213325573.0, + "step": 5589 + }, + { + "epoch": 0.7111054573209515, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8884013891220093, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.866813063621521, + "num_tokens": 213363738.0, + "step": 5590 + }, + { + "epoch": 0.7112326675995421, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.0321617126464844, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8643828630447388, + "num_tokens": 213404796.0, + "step": 5591 + }, + { + "epoch": 0.7113598778781326, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9339582920074463, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8516567945480347, + "num_tokens": 213436711.0, + "step": 5592 + }, + { + "epoch": 0.711487088156723, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9597419500350952, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8623862862586975, + "num_tokens": 213473458.0, + "step": 5593 + }, + { + "epoch": 0.7116142984353135, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8930377960205078, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8696126341819763, + "num_tokens": 213507355.0, + "step": 5594 + }, + { + "epoch": 0.7117415087139041, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.0524561405181885, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8415282368659973, + "num_tokens": 213543771.0, + "step": 5595 + }, + { + "epoch": 0.7118687189924946, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.868822455406189, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8558233380317688, + "num_tokens": 213578317.0, + "step": 5596 + }, + { + "epoch": 0.7119959292710851, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9288475513458252, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8558515310287476, + "num_tokens": 213615795.0, + "step": 5597 + }, + { + "epoch": 0.7121231395496757, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9300625324249268, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8690629005432129, + "num_tokens": 213655452.0, + "step": 5598 + }, + { + "epoch": 0.7122503498282661, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.1667611598968506, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.85041344165802, + "num_tokens": 213688079.0, + "step": 5599 + }, + { + "epoch": 0.7123775601068566, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8178331851959229, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8757924437522888, + "num_tokens": 213725310.0, + "step": 5600 + }, + { + "epoch": 0.7125047703854471, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.0291495323181152, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8543822765350342, + "num_tokens": 213757950.0, + "step": 5601 + }, + { + "epoch": 0.7126319806640377, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.005601406097412, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8477969169616699, + "num_tokens": 213794146.0, + "step": 5602 + }, + { + "epoch": 0.7127591909426282, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.885396957397461, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8642452359199524, + "num_tokens": 213834922.0, + "step": 5603 + }, + { + "epoch": 0.7128864012212187, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7848844528198242, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8779021501541138, + "num_tokens": 213879651.0, + "step": 5604 + }, + { + "epoch": 0.7130136114998091, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.779745101928711, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8579189777374268, + "num_tokens": 213921833.0, + "step": 5605 + }, + { + "epoch": 0.7131408217783997, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8436120748519897, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.864826500415802, + "num_tokens": 213962909.0, + "step": 5606 + }, + { + "epoch": 0.7132680320569902, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7588510513305664, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.861722469329834, + "num_tokens": 214002030.0, + "step": 5607 + }, + { + "epoch": 0.7133952423355807, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.121288299560547, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8532624840736389, + "num_tokens": 214037094.0, + "step": 5608 + }, + { + "epoch": 0.7135224526141712, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9082084894180298, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8639780282974243, + "num_tokens": 214076649.0, + "step": 5609 + }, + { + "epoch": 0.7136496628927618, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8923133611679077, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8538587689399719, + "num_tokens": 214114194.0, + "step": 5610 + }, + { + "epoch": 0.7137768731713523, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9611239433288574, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8611950874328613, + "num_tokens": 214149652.0, + "step": 5611 + }, + { + "epoch": 0.7139040834499427, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.7654874324798584, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8686541318893433, + "num_tokens": 214189068.0, + "step": 5612 + }, + { + "epoch": 0.7140312937285332, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.0771484375, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8484539985656738, + "num_tokens": 214220251.0, + "step": 5613 + }, + { + "epoch": 0.7141585040071238, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.3891143798828125, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8573704957962036, + "num_tokens": 214256705.0, + "step": 5614 + }, + { + "epoch": 0.7142857142857143, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8871684074401855, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8721388578414917, + "num_tokens": 214292563.0, + "step": 5615 + }, + { + "epoch": 0.7144129245643048, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7945290803909302, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.85312819480896, + "num_tokens": 214333843.0, + "step": 5616 + }, + { + "epoch": 0.7145401348428954, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8730592727661133, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8696008324623108, + "num_tokens": 214372272.0, + "step": 5617 + }, + { + "epoch": 0.7146673451214858, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7167606353759766, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8653628826141357, + "num_tokens": 214416364.0, + "step": 5618 + }, + { + "epoch": 0.7147945554000763, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8948863744735718, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8582298159599304, + "num_tokens": 214453102.0, + "step": 5619 + }, + { + "epoch": 0.7149217656786668, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7691309452056885, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8642871975898743, + "num_tokens": 214492374.0, + "step": 5620 + }, + { + "epoch": 0.7150489759572574, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 80.52189636230469, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8593396544456482, + "num_tokens": 214530918.0, + "step": 5621 + }, + { + "epoch": 0.7151761862358479, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9654924869537354, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8755084276199341, + "num_tokens": 214570227.0, + "step": 5622 + }, + { + "epoch": 0.7153033965144384, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9263341426849365, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8630739450454712, + "num_tokens": 214609519.0, + "step": 5623 + }, + { + "epoch": 0.7154306067930288, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.021040439605713, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8574246168136597, + "num_tokens": 214643564.0, + "step": 5624 + }, + { + "epoch": 0.7155578170716194, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9798164367675781, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.867279052734375, + "num_tokens": 214677632.0, + "step": 5625 + }, + { + "epoch": 0.7156850273502099, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8989518880844116, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8615041375160217, + "num_tokens": 214710363.0, + "step": 5626 + }, + { + "epoch": 0.7158122376288004, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9261459112167358, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8668548464775085, + "num_tokens": 214746982.0, + "step": 5627 + }, + { + "epoch": 0.715939447907391, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9329816102981567, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.85120689868927, + "num_tokens": 214786447.0, + "step": 5628 + }, + { + "epoch": 0.7160666581859815, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.963904857635498, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8676875829696655, + "num_tokens": 214824627.0, + "step": 5629 + }, + { + "epoch": 0.7161938684645719, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9668593406677246, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8416265249252319, + "num_tokens": 214863051.0, + "step": 5630 + }, + { + "epoch": 0.7163210787431624, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.093500852584839, + "learning_rate": 1e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8467167615890503, + "num_tokens": 214900524.0, + "step": 5631 + }, + { + "epoch": 0.716448289021753, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8886810541152954, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8658421635627747, + "num_tokens": 214943949.0, + "step": 5632 + }, + { + "epoch": 0.7165754993003435, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.964741826057434, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8634876012802124, + "num_tokens": 214979126.0, + "step": 5633 + }, + { + "epoch": 0.716702709578934, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8544951677322388, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.861015796661377, + "num_tokens": 215017740.0, + "step": 5634 + }, + { + "epoch": 0.7168299198575245, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8594204187393188, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8674965500831604, + "num_tokens": 215054472.0, + "step": 5635 + }, + { + "epoch": 0.716957130136115, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9087165594100952, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8629428744316101, + "num_tokens": 215086894.0, + "step": 5636 + }, + { + "epoch": 0.7170843404147055, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8744021654129028, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.860506534576416, + "num_tokens": 215128157.0, + "step": 5637 + }, + { + "epoch": 0.717211550693296, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 3.211956024169922, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8672438263893127, + "num_tokens": 215161008.0, + "step": 5638 + }, + { + "epoch": 0.7173387609718865, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.874074697494507, + "learning_rate": 1e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8432981967926025, + "num_tokens": 215197157.0, + "step": 5639 + }, + { + "epoch": 0.7174659712504771, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.003371477127075, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8662932515144348, + "num_tokens": 215240270.0, + "step": 5640 + }, + { + "epoch": 0.7175931815290676, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7627966403961182, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8681608438491821, + "num_tokens": 215279828.0, + "step": 5641 + }, + { + "epoch": 0.717720391807658, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8361546993255615, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8549558520317078, + "num_tokens": 215315271.0, + "step": 5642 + }, + { + "epoch": 0.7178476020862485, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9336755275726318, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8615502119064331, + "num_tokens": 215352621.0, + "step": 5643 + }, + { + "epoch": 0.7179748123648391, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7811917066574097, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8620367050170898, + "num_tokens": 215390231.0, + "step": 5644 + }, + { + "epoch": 0.7181020226434296, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8092405796051025, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8745908141136169, + "num_tokens": 215426812.0, + "step": 5645 + }, + { + "epoch": 0.7182292329220201, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9359827041625977, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8493114113807678, + "num_tokens": 215467551.0, + "step": 5646 + }, + { + "epoch": 0.7183564432006107, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.8174816370010376, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8687143325805664, + "num_tokens": 215508559.0, + "step": 5647 + }, + { + "epoch": 0.7184836534792011, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.853393793106079, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8542807102203369, + "num_tokens": 215548709.0, + "step": 5648 + }, + { + "epoch": 0.7186108637577916, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.7393786907196045, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8607974052429199, + "num_tokens": 215593844.0, + "step": 5649 + }, + { + "epoch": 0.7187380740363821, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.899999976158142, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8651180863380432, + "num_tokens": 215633853.0, + "step": 5650 + }, + { + "epoch": 0.7188652843149727, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 1.9506447315216064, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8589795827865601, + "num_tokens": 215667272.0, + "step": 5651 + }, + { + "epoch": 0.7189924945935632, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.2298622131347656, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8420900106430054, + "num_tokens": 215705889.0, + "step": 5652 + }, + { + "epoch": 0.7191197048721537, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.0596158504486084, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8697931170463562, + "num_tokens": 215749143.0, + "step": 5653 + }, + { + "epoch": 0.7192469151507441, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8928683996200562, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8553997278213501, + "num_tokens": 215787583.0, + "step": 5654 + }, + { + "epoch": 0.7193741254293347, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9565603733062744, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8599731922149658, + "num_tokens": 215825552.0, + "step": 5655 + }, + { + "epoch": 0.7195013357079252, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.0161521434783936, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8700389266014099, + "num_tokens": 215858310.0, + "step": 5656 + }, + { + "epoch": 0.7196285459865157, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.2442734241485596, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8491154909133911, + "num_tokens": 215898403.0, + "step": 5657 + }, + { + "epoch": 0.7197557562651062, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.925228238105774, + "learning_rate": 1e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.8367441892623901, + "num_tokens": 215935930.0, + "step": 5658 + }, + { + "epoch": 0.7198829665436968, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8871455192565918, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8564702272415161, + "num_tokens": 215971509.0, + "step": 5659 + }, + { + "epoch": 0.7200101768222873, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8662039041519165, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8669143915176392, + "num_tokens": 216008696.0, + "step": 5660 + }, + { + "epoch": 0.7201373871008777, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8291970491409302, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.860744833946228, + "num_tokens": 216045861.0, + "step": 5661 + }, + { + "epoch": 0.7202645973794682, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.99229896068573, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8697615265846252, + "num_tokens": 216081307.0, + "step": 5662 + }, + { + "epoch": 0.7203918076580588, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.1784415245056152, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8708265423774719, + "num_tokens": 216118506.0, + "step": 5663 + }, + { + "epoch": 0.7205190179366493, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.824468731880188, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.852971076965332, + "num_tokens": 216159000.0, + "step": 5664 + }, + { + "epoch": 0.7206462282152398, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8615262508392334, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8529648184776306, + "num_tokens": 216204712.0, + "step": 5665 + }, + { + "epoch": 0.7207734384938304, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.045398712158203, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8523919582366943, + "num_tokens": 216235001.0, + "step": 5666 + }, + { + "epoch": 0.7209006487724208, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.308992385864258, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8535302877426147, + "num_tokens": 216271383.0, + "step": 5667 + }, + { + "epoch": 0.7210278590510113, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8526023626327515, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8612589836120605, + "num_tokens": 216307923.0, + "step": 5668 + }, + { + "epoch": 0.7211550693296018, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.729699969291687, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8790221214294434, + "num_tokens": 216345147.0, + "step": 5669 + }, + { + "epoch": 0.7212822796081924, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.0006673336029053, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8521970510482788, + "num_tokens": 216386991.0, + "step": 5670 + }, + { + "epoch": 0.7214094898867829, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 3.0527493953704834, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8630548119544983, + "num_tokens": 216425534.0, + "step": 5671 + }, + { + "epoch": 0.7215367001653734, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8128008842468262, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8601564764976501, + "num_tokens": 216466686.0, + "step": 5672 + }, + { + "epoch": 0.7216639104439638, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7571998834609985, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8720728158950806, + "num_tokens": 216505245.0, + "step": 5673 + }, + { + "epoch": 0.7217911207225544, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.118762731552124, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8568200469017029, + "num_tokens": 216538024.0, + "step": 5674 + }, + { + "epoch": 0.7219183310011449, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9868237972259521, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8590325117111206, + "num_tokens": 216573722.0, + "step": 5675 + }, + { + "epoch": 0.7220455412797354, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.3113296031951904, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8654296398162842, + "num_tokens": 216610362.0, + "step": 5676 + }, + { + "epoch": 0.7221727515583259, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9088129997253418, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8530006408691406, + "num_tokens": 216648732.0, + "step": 5677 + }, + { + "epoch": 0.7222999618369165, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9601950645446777, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8688876628875732, + "num_tokens": 216680324.0, + "step": 5678 + }, + { + "epoch": 0.7224271721155069, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.197213888168335, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8625023365020752, + "num_tokens": 216712773.0, + "step": 5679 + }, + { + "epoch": 0.7225543823940974, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.3326239585876465, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8532404899597168, + "num_tokens": 216752538.0, + "step": 5680 + }, + { + "epoch": 0.722681592672688, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.0290627479553223, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8636400103569031, + "num_tokens": 216791795.0, + "step": 5681 + }, + { + "epoch": 0.7228088029512785, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9639827013015747, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8573729395866394, + "num_tokens": 216831729.0, + "step": 5682 + }, + { + "epoch": 0.722936013229869, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8466999530792236, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.860815703868866, + "num_tokens": 216869494.0, + "step": 5683 + }, + { + "epoch": 0.7230632235084595, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7919095754623413, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8417239189147949, + "num_tokens": 216906672.0, + "step": 5684 + }, + { + "epoch": 0.72319043378705, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.256479263305664, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8665257692337036, + "num_tokens": 216941587.0, + "step": 5685 + }, + { + "epoch": 0.7233176440656405, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8232322931289673, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8540735244750977, + "num_tokens": 216981812.0, + "step": 5686 + }, + { + "epoch": 0.723444854344231, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.838460922241211, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8705539703369141, + "num_tokens": 217024137.0, + "step": 5687 + }, + { + "epoch": 0.7235720646228215, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8637617826461792, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8731060028076172, + "num_tokens": 217063753.0, + "step": 5688 + }, + { + "epoch": 0.7236992749014121, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.68276846408844, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8578870296478271, + "num_tokens": 217105611.0, + "step": 5689 + }, + { + "epoch": 0.7238264851800026, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9496901035308838, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.84074866771698, + "num_tokens": 217145989.0, + "step": 5690 + }, + { + "epoch": 0.723953695458593, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.000378131866455, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8777415752410889, + "num_tokens": 217183726.0, + "step": 5691 + }, + { + "epoch": 0.7240809057371835, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9026137590408325, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8577814102172852, + "num_tokens": 217226644.0, + "step": 5692 + }, + { + "epoch": 0.7242081160157741, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.078425407409668, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.846031665802002, + "num_tokens": 217263327.0, + "step": 5693 + }, + { + "epoch": 0.7243353262943646, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8425524234771729, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8649778962135315, + "num_tokens": 217300595.0, + "step": 5694 + }, + { + "epoch": 0.7244625365729551, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.2940142154693604, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8471150398254395, + "num_tokens": 217336385.0, + "step": 5695 + }, + { + "epoch": 0.7245897468515456, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.966179609298706, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8673600554466248, + "num_tokens": 217374309.0, + "step": 5696 + }, + { + "epoch": 0.7247169571301361, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9886614084243774, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8502862453460693, + "num_tokens": 217408386.0, + "step": 5697 + }, + { + "epoch": 0.7248441674087266, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9234774112701416, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8470678329467773, + "num_tokens": 217442855.0, + "step": 5698 + }, + { + "epoch": 0.7249713776873171, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.848164677619934, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8652364611625671, + "num_tokens": 217476892.0, + "step": 5699 + }, + { + "epoch": 0.7250985879659076, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9103684425354004, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8587881326675415, + "num_tokens": 217513569.0, + "step": 5700 + }, + { + "epoch": 0.7252257982444982, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.036795139312744, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8555393218994141, + "num_tokens": 217545585.0, + "step": 5701 + }, + { + "epoch": 0.7253530085230887, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9148600101470947, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.863206148147583, + "num_tokens": 217583156.0, + "step": 5702 + }, + { + "epoch": 0.7254802188016791, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.021365165710449, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8615953922271729, + "num_tokens": 217619630.0, + "step": 5703 + }, + { + "epoch": 0.7256074290802697, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8851051330566406, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8704962134361267, + "num_tokens": 217653616.0, + "step": 5704 + }, + { + "epoch": 0.7257346393588602, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0055606365203857, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8556352853775024, + "num_tokens": 217692758.0, + "step": 5705 + }, + { + "epoch": 0.7258618496374507, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9505672454833984, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8444318175315857, + "num_tokens": 217734175.0, + "step": 5706 + }, + { + "epoch": 0.7259890599160412, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9903433322906494, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8389931917190552, + "num_tokens": 217773372.0, + "step": 5707 + }, + { + "epoch": 0.7261162701946318, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.126124382019043, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8690483570098877, + "num_tokens": 217809174.0, + "step": 5708 + }, + { + "epoch": 0.7262434804732223, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8629482984542847, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8539211750030518, + "num_tokens": 217845882.0, + "step": 5709 + }, + { + "epoch": 0.7263706907518127, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8718526363372803, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8554088473320007, + "num_tokens": 217885798.0, + "step": 5710 + }, + { + "epoch": 0.7264979010304032, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.1363635063171387, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8570895195007324, + "num_tokens": 217924967.0, + "step": 5711 + }, + { + "epoch": 0.7266251113089938, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8743164539337158, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.870707631111145, + "num_tokens": 217967381.0, + "step": 5712 + }, + { + "epoch": 0.7267523215875843, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.933685302734375, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8665732145309448, + "num_tokens": 218001097.0, + "step": 5713 + }, + { + "epoch": 0.7268795318661748, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.793738603591919, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.862122654914856, + "num_tokens": 218040705.0, + "step": 5714 + }, + { + "epoch": 0.7270067421447653, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.8805150985717773, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.860052227973938, + "num_tokens": 218077549.0, + "step": 5715 + }, + { + "epoch": 0.7271339524233558, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9601993560791016, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8602285385131836, + "num_tokens": 218116251.0, + "step": 5716 + }, + { + "epoch": 0.7272611627019463, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.2600290775299072, + "learning_rate": 1e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.8335331082344055, + "num_tokens": 218155798.0, + "step": 5717 + }, + { + "epoch": 0.7273883729805368, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.856105089187622, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8660738468170166, + "num_tokens": 218191277.0, + "step": 5718 + }, + { + "epoch": 0.7275155832591274, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.954035758972168, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8660057783126831, + "num_tokens": 218233165.0, + "step": 5719 + }, + { + "epoch": 0.7276427935377179, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9450281858444214, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8396395444869995, + "num_tokens": 218271263.0, + "step": 5720 + }, + { + "epoch": 0.7277700038163084, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0310721397399902, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.861107587814331, + "num_tokens": 218309081.0, + "step": 5721 + }, + { + "epoch": 0.7278972140948988, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.1447668075561523, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8550307154655457, + "num_tokens": 218348314.0, + "step": 5722 + }, + { + "epoch": 0.7280244243734894, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8119155168533325, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8645321130752563, + "num_tokens": 218387556.0, + "step": 5723 + }, + { + "epoch": 0.7281516346520799, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8973602056503296, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8520053625106812, + "num_tokens": 218423130.0, + "step": 5724 + }, + { + "epoch": 0.7282788449306704, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0050346851348877, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8422731161117554, + "num_tokens": 218464079.0, + "step": 5725 + }, + { + "epoch": 0.7284060552092609, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.004276990890503, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8680776357650757, + "num_tokens": 218501307.0, + "step": 5726 + }, + { + "epoch": 0.7285332654878515, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8295679092407227, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8679465651512146, + "num_tokens": 218537710.0, + "step": 5727 + }, + { + "epoch": 0.7286604757664419, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.847285270690918, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8557391166687012, + "num_tokens": 218576026.0, + "step": 5728 + }, + { + "epoch": 0.7287876860450324, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8032552003860474, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.857059121131897, + "num_tokens": 218612491.0, + "step": 5729 + }, + { + "epoch": 0.7289148963236229, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.377098560333252, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8683295249938965, + "num_tokens": 218642541.0, + "step": 5730 + }, + { + "epoch": 0.7290421066022135, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.3424570560455322, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8463830351829529, + "num_tokens": 218678514.0, + "step": 5731 + }, + { + "epoch": 0.729169316880804, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8585662841796875, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8639389276504517, + "num_tokens": 218715415.0, + "step": 5732 + }, + { + "epoch": 0.7292965271593945, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.7280381917953491, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8675600290298462, + "num_tokens": 218754422.0, + "step": 5733 + }, + { + "epoch": 0.7294237374379849, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.287429094314575, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8514516353607178, + "num_tokens": 218787141.0, + "step": 5734 + }, + { + "epoch": 0.7295509477165755, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8441073894500732, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8747162222862244, + "num_tokens": 218824764.0, + "step": 5735 + }, + { + "epoch": 0.729678157995166, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.025113105773926, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8583125472068787, + "num_tokens": 218866142.0, + "step": 5736 + }, + { + "epoch": 0.7298053682737565, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.891486406326294, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8549593687057495, + "num_tokens": 218909208.0, + "step": 5737 + }, + { + "epoch": 0.7299325785523471, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.051898956298828, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8676309585571289, + "num_tokens": 218942469.0, + "step": 5738 + }, + { + "epoch": 0.7300597888309376, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9178191423416138, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8531359434127808, + "num_tokens": 218982934.0, + "step": 5739 + }, + { + "epoch": 0.730186999109528, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.7851048707962036, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8561989068984985, + "num_tokens": 219027055.0, + "step": 5740 + }, + { + "epoch": 0.7303142093881185, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.1239256858825684, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8607663512229919, + "num_tokens": 219069639.0, + "step": 5741 + }, + { + "epoch": 0.7304414196667091, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9310029745101929, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8538503646850586, + "num_tokens": 219111774.0, + "step": 5742 + }, + { + "epoch": 0.7305686299452996, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8018039464950562, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8475689888000488, + "num_tokens": 219156962.0, + "step": 5743 + }, + { + "epoch": 0.7306958402238901, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.0909173488616943, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8685296773910522, + "num_tokens": 219193352.0, + "step": 5744 + }, + { + "epoch": 0.7308230505024806, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0022668838500977, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8460187911987305, + "num_tokens": 219230041.0, + "step": 5745 + }, + { + "epoch": 0.7309502607810711, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.146822452545166, + "learning_rate": 1e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8373160362243652, + "num_tokens": 219262748.0, + "step": 5746 + }, + { + "epoch": 0.7310774710596616, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.3668224811553955, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.872883677482605, + "num_tokens": 219296556.0, + "step": 5747 + }, + { + "epoch": 0.7312046813382521, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0094892978668213, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8470501899719238, + "num_tokens": 219330801.0, + "step": 5748 + }, + { + "epoch": 0.7313318916168426, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.061163902282715, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8594347834587097, + "num_tokens": 219367470.0, + "step": 5749 + }, + { + "epoch": 0.7314591018954332, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9887641668319702, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8463312387466431, + "num_tokens": 219405291.0, + "step": 5750 + }, + { + "epoch": 0.7315863121740237, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8145084381103516, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8618172407150269, + "num_tokens": 219443365.0, + "step": 5751 + }, + { + "epoch": 0.7317135224526141, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.7987507581710815, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8615477681159973, + "num_tokens": 219477524.0, + "step": 5752 + }, + { + "epoch": 0.7318407327312046, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8826324939727783, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8501495122909546, + "num_tokens": 219514576.0, + "step": 5753 + }, + { + "epoch": 0.7319679430097952, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0861029624938965, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8608673810958862, + "num_tokens": 219558253.0, + "step": 5754 + }, + { + "epoch": 0.7320951532883857, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9287984371185303, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8553421497344971, + "num_tokens": 219601156.0, + "step": 5755 + }, + { + "epoch": 0.7322223635669762, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9173109531402588, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8499272465705872, + "num_tokens": 219638241.0, + "step": 5756 + }, + { + "epoch": 0.7323495738455668, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8556852340698242, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8609631061553955, + "num_tokens": 219678168.0, + "step": 5757 + }, + { + "epoch": 0.7324767841241572, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8458333015441895, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8569120168685913, + "num_tokens": 219718115.0, + "step": 5758 + }, + { + "epoch": 0.7326039944027477, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9472453594207764, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8592103719711304, + "num_tokens": 219754221.0, + "step": 5759 + }, + { + "epoch": 0.7327312046813382, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.0097033977508545, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8646312952041626, + "num_tokens": 219786880.0, + "step": 5760 + }, + { + "epoch": 0.7328584149599288, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.3671905994415283, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8626828193664551, + "num_tokens": 219830585.0, + "step": 5761 + }, + { + "epoch": 0.7329856252385193, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9330964088439941, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8634133338928223, + "num_tokens": 219867249.0, + "step": 5762 + }, + { + "epoch": 0.7331128355171098, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8724397420883179, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.851121187210083, + "num_tokens": 219906046.0, + "step": 5763 + }, + { + "epoch": 0.7332400457957003, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8359363079071045, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8506374359130859, + "num_tokens": 219947466.0, + "step": 5764 + }, + { + "epoch": 0.7333672560742908, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9554998874664307, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.861889123916626, + "num_tokens": 219985698.0, + "step": 5765 + }, + { + "epoch": 0.7334944663528813, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.7487601041793823, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8599201440811157, + "num_tokens": 220026158.0, + "step": 5766 + }, + { + "epoch": 0.7336216766314718, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.7991629838943481, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8575060963630676, + "num_tokens": 220071849.0, + "step": 5767 + }, + { + "epoch": 0.7337488869100623, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9252055883407593, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8651530146598816, + "num_tokens": 220110255.0, + "step": 5768 + }, + { + "epoch": 0.7338760971886529, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9184496402740479, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8430113196372986, + "num_tokens": 220144962.0, + "step": 5769 + }, + { + "epoch": 0.7340033074672434, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.3004257678985596, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8486524820327759, + "num_tokens": 220182118.0, + "step": 5770 + }, + { + "epoch": 0.7341305177458338, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.7890876531600952, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8507921695709229, + "num_tokens": 220218954.0, + "step": 5771 + }, + { + "epoch": 0.7342577280244243, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9365806579589844, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8582280278205872, + "num_tokens": 220259116.0, + "step": 5772 + }, + { + "epoch": 0.7343849383030149, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.8315634727478027, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8561386466026306, + "num_tokens": 220301742.0, + "step": 5773 + }, + { + "epoch": 0.7345121485816054, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8258893489837646, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8630402088165283, + "num_tokens": 220342572.0, + "step": 5774 + }, + { + "epoch": 0.7346393588601959, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.012622356414795, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8645486831665039, + "num_tokens": 220376373.0, + "step": 5775 + }, + { + "epoch": 0.7347665691387865, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.7304600477218628, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8758795261383057, + "num_tokens": 220413290.0, + "step": 5776 + }, + { + "epoch": 0.7348937794173769, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.7220909595489502, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8559908866882324, + "num_tokens": 220453219.0, + "step": 5777 + }, + { + "epoch": 0.7350209896959674, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8616682291030884, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8598967790603638, + "num_tokens": 220488369.0, + "step": 5778 + }, + { + "epoch": 0.7351481999745579, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.4868063926696777, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8633999824523926, + "num_tokens": 220527798.0, + "step": 5779 + }, + { + "epoch": 0.7352754102531485, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.893052101135254, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8763608932495117, + "num_tokens": 220565398.0, + "step": 5780 + }, + { + "epoch": 0.735402620531739, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.7463099956512451, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8696293234825134, + "num_tokens": 220603903.0, + "step": 5781 + }, + { + "epoch": 0.7355298308103295, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8144075870513916, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8411243557929993, + "num_tokens": 220644168.0, + "step": 5782 + }, + { + "epoch": 0.7356570410889199, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8636009693145752, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.871943473815918, + "num_tokens": 220679709.0, + "step": 5783 + }, + { + "epoch": 0.7357842513675105, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.7840895652770996, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8527001738548279, + "num_tokens": 220721184.0, + "step": 5784 + }, + { + "epoch": 0.735911461646101, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.768411874771118, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.863406777381897, + "num_tokens": 220755027.0, + "step": 5785 + }, + { + "epoch": 0.7360386719246915, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0127155780792236, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8354459404945374, + "num_tokens": 220793597.0, + "step": 5786 + }, + { + "epoch": 0.736165882203282, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8475475311279297, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8587582111358643, + "num_tokens": 220833549.0, + "step": 5787 + }, + { + "epoch": 0.7362930924818726, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9718005657196045, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8657255172729492, + "num_tokens": 220865551.0, + "step": 5788 + }, + { + "epoch": 0.736420302760463, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.9795637130737305, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8712266087532043, + "num_tokens": 220911626.0, + "step": 5789 + }, + { + "epoch": 0.7365475130390535, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.111762046813965, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.860346794128418, + "num_tokens": 220949839.0, + "step": 5790 + }, + { + "epoch": 0.736674723317644, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.1351730823516846, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8424962759017944, + "num_tokens": 220984069.0, + "step": 5791 + }, + { + "epoch": 0.7368019335962346, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.923742413520813, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8794634938240051, + "num_tokens": 221020999.0, + "step": 5792 + }, + { + "epoch": 0.7369291438748251, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.054518699645996, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8505203723907471, + "num_tokens": 221051082.0, + "step": 5793 + }, + { + "epoch": 0.7370563541534156, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8602734804153442, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8417853116989136, + "num_tokens": 221095444.0, + "step": 5794 + }, + { + "epoch": 0.737183564432006, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8801594972610474, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8619255423545837, + "num_tokens": 221136433.0, + "step": 5795 + }, + { + "epoch": 0.7373107747105966, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.004795789718628, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8519071936607361, + "num_tokens": 221171278.0, + "step": 5796 + }, + { + "epoch": 0.7374379849891871, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9657866954803467, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8589755296707153, + "num_tokens": 221209380.0, + "step": 5797 + }, + { + "epoch": 0.7375651952677776, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8939650058746338, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8570278286933899, + "num_tokens": 221253076.0, + "step": 5798 + }, + { + "epoch": 0.7376924055463682, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.048872232437134, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8547074794769287, + "num_tokens": 221290348.0, + "step": 5799 + }, + { + "epoch": 0.7378196158249587, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8354054689407349, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8655128479003906, + "num_tokens": 221329635.0, + "step": 5800 + }, + { + "epoch": 0.7379468261035491, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.7510515451431274, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8670080900192261, + "num_tokens": 221368521.0, + "step": 5801 + }, + { + "epoch": 0.7380740363821396, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 16.591737747192383, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8705593347549438, + "num_tokens": 221407090.0, + "step": 5802 + }, + { + "epoch": 0.7382012466607302, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 3.0512900352478027, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8680423498153687, + "num_tokens": 221450340.0, + "step": 5803 + }, + { + "epoch": 0.7383284569393207, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.9645088911056519, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8600631952285767, + "num_tokens": 221491996.0, + "step": 5804 + }, + { + "epoch": 0.7384556672179112, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.890866756439209, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8674803376197815, + "num_tokens": 221531466.0, + "step": 5805 + }, + { + "epoch": 0.7385828774965018, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.0740349292755127, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8410782217979431, + "num_tokens": 221567003.0, + "step": 5806 + }, + { + "epoch": 0.7387100877750922, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.7882864475250244, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8715851306915283, + "num_tokens": 221607396.0, + "step": 5807 + }, + { + "epoch": 0.7388372980536827, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.982469081878662, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8530839681625366, + "num_tokens": 221644776.0, + "step": 5808 + }, + { + "epoch": 0.7389645083322732, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0077128410339355, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8719325661659241, + "num_tokens": 221685643.0, + "step": 5809 + }, + { + "epoch": 0.7390917186108638, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.976745843887329, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.852873682975769, + "num_tokens": 221724673.0, + "step": 5810 + }, + { + "epoch": 0.7392189288894543, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.34440541267395, + "learning_rate": 1e-06, + "loss": 0.5316, + "mean_token_accuracy": 0.835541307926178, + "num_tokens": 221766287.0, + "step": 5811 + }, + { + "epoch": 0.7393461391680448, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9578723907470703, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8578252792358398, + "num_tokens": 221805357.0, + "step": 5812 + }, + { + "epoch": 0.7394733494466353, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0599679946899414, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8565405607223511, + "num_tokens": 221838513.0, + "step": 5813 + }, + { + "epoch": 0.7396005597252258, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8674509525299072, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8484207987785339, + "num_tokens": 221882808.0, + "step": 5814 + }, + { + "epoch": 0.7397277700038163, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.969748854637146, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8726972341537476, + "num_tokens": 221918505.0, + "step": 5815 + }, + { + "epoch": 0.7398549802824068, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9796454906463623, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8607931137084961, + "num_tokens": 221955983.0, + "step": 5816 + }, + { + "epoch": 0.7399821905609973, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.873723030090332, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8588625192642212, + "num_tokens": 221997363.0, + "step": 5817 + }, + { + "epoch": 0.7401094008395879, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.891895055770874, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8446784615516663, + "num_tokens": 222039302.0, + "step": 5818 + }, + { + "epoch": 0.7402366111181784, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8854262828826904, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.858330249786377, + "num_tokens": 222075779.0, + "step": 5819 + }, + { + "epoch": 0.7403638213967688, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8556976318359375, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8782821893692017, + "num_tokens": 222115120.0, + "step": 5820 + }, + { + "epoch": 0.7404910316753593, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9766570329666138, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8517913818359375, + "num_tokens": 222156570.0, + "step": 5821 + }, + { + "epoch": 0.7406182419539499, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.192007303237915, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8595147728919983, + "num_tokens": 222189487.0, + "step": 5822 + }, + { + "epoch": 0.7407454522325404, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0298266410827637, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8574900031089783, + "num_tokens": 222223163.0, + "step": 5823 + }, + { + "epoch": 0.7408726625111309, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.3908257484436035, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8713918924331665, + "num_tokens": 222259042.0, + "step": 5824 + }, + { + "epoch": 0.7409998727897215, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8944191932678223, + "learning_rate": 1e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8408498764038086, + "num_tokens": 222300497.0, + "step": 5825 + }, + { + "epoch": 0.7411270830683119, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0450875759124756, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8559861183166504, + "num_tokens": 222333630.0, + "step": 5826 + }, + { + "epoch": 0.7412542933469024, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.925257921218872, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8546465039253235, + "num_tokens": 222371715.0, + "step": 5827 + }, + { + "epoch": 0.7413815036254929, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.899898648262024, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8593794107437134, + "num_tokens": 222412044.0, + "step": 5828 + }, + { + "epoch": 0.7415087139040835, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0419540405273438, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8610317707061768, + "num_tokens": 222452668.0, + "step": 5829 + }, + { + "epoch": 0.741635924182674, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9048534631729126, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8616844415664673, + "num_tokens": 222485631.0, + "step": 5830 + }, + { + "epoch": 0.7417631344612645, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.819205641746521, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8743944764137268, + "num_tokens": 222522967.0, + "step": 5831 + }, + { + "epoch": 0.7418903447398549, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9131006002426147, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8497660160064697, + "num_tokens": 222563258.0, + "step": 5832 + }, + { + "epoch": 0.7420175550184455, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 16.609790802001953, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8522515296936035, + "num_tokens": 222599248.0, + "step": 5833 + }, + { + "epoch": 0.742144765297036, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.0042128562927246, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8658624887466431, + "num_tokens": 222642575.0, + "step": 5834 + }, + { + "epoch": 0.7422719755756265, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.8127331733703613, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8359082341194153, + "num_tokens": 222687330.0, + "step": 5835 + }, + { + "epoch": 0.742399185854217, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.1221561431884766, + "learning_rate": 1e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.8343291282653809, + "num_tokens": 222720020.0, + "step": 5836 + }, + { + "epoch": 0.7425263961328076, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8146371841430664, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8717923164367676, + "num_tokens": 222758634.0, + "step": 5837 + }, + { + "epoch": 0.742653606411398, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8946566581726074, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8663122653961182, + "num_tokens": 222794823.0, + "step": 5838 + }, + { + "epoch": 0.7427808166899885, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.936040997505188, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8586910963058472, + "num_tokens": 222833836.0, + "step": 5839 + }, + { + "epoch": 0.742908026968579, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8123140335083008, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8610820770263672, + "num_tokens": 222875712.0, + "step": 5840 + }, + { + "epoch": 0.7430352372471696, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9077389240264893, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8479267358779907, + "num_tokens": 222915494.0, + "step": 5841 + }, + { + "epoch": 0.7431624475257601, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.1747307777404785, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8751880526542664, + "num_tokens": 222951675.0, + "step": 5842 + }, + { + "epoch": 0.7432896578043506, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8041331768035889, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8545407652854919, + "num_tokens": 222993245.0, + "step": 5843 + }, + { + "epoch": 0.743416868082941, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.205475330352783, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8655625581741333, + "num_tokens": 223029876.0, + "step": 5844 + }, + { + "epoch": 0.7435440783615316, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8824081420898438, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8646489977836609, + "num_tokens": 223072550.0, + "step": 5845 + }, + { + "epoch": 0.7436712886401221, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0198185443878174, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8583260774612427, + "num_tokens": 223112892.0, + "step": 5846 + }, + { + "epoch": 0.7437984989187126, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.119736909866333, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8648451566696167, + "num_tokens": 223157093.0, + "step": 5847 + }, + { + "epoch": 0.7439257091973032, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0346086025238037, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8502721786499023, + "num_tokens": 223190756.0, + "step": 5848 + }, + { + "epoch": 0.7440529194758937, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9560048580169678, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8506210446357727, + "num_tokens": 223226013.0, + "step": 5849 + }, + { + "epoch": 0.7441801297544841, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.890088438987732, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8714023232460022, + "num_tokens": 223262858.0, + "step": 5850 + }, + { + "epoch": 0.7443073400330746, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8082964420318604, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8629205226898193, + "num_tokens": 223310528.0, + "step": 5851 + }, + { + "epoch": 0.7444345503116652, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9859380722045898, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8621731400489807, + "num_tokens": 223345021.0, + "step": 5852 + }, + { + "epoch": 0.7445617605902557, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.7731822729110718, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8687127232551575, + "num_tokens": 223382055.0, + "step": 5853 + }, + { + "epoch": 0.7446889708688462, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8915205001831055, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8493281602859497, + "num_tokens": 223418441.0, + "step": 5854 + }, + { + "epoch": 0.7448161811474368, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9870474338531494, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8575255274772644, + "num_tokens": 223458150.0, + "step": 5855 + }, + { + "epoch": 0.7449433914260272, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7883282899856567, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8568837642669678, + "num_tokens": 223496916.0, + "step": 5856 + }, + { + "epoch": 0.7450706017046177, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8552765846252441, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8655891418457031, + "num_tokens": 223534808.0, + "step": 5857 + }, + { + "epoch": 0.7451978119832082, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.924100399017334, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8678959012031555, + "num_tokens": 223571255.0, + "step": 5858 + }, + { + "epoch": 0.7453250222617988, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.7540720701217651, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8625460863113403, + "num_tokens": 223616608.0, + "step": 5859 + }, + { + "epoch": 0.7454522325403893, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.833458423614502, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8603008985519409, + "num_tokens": 223656188.0, + "step": 5860 + }, + { + "epoch": 0.7455794428189798, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8169969320297241, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8554083108901978, + "num_tokens": 223693787.0, + "step": 5861 + }, + { + "epoch": 0.7457066530975703, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0230295658111572, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.87166827917099, + "num_tokens": 223729350.0, + "step": 5862 + }, + { + "epoch": 0.7458338633761608, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.7942957878112793, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8476272821426392, + "num_tokens": 223770846.0, + "step": 5863 + }, + { + "epoch": 0.7459610736547513, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.791195273399353, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8590192794799805, + "num_tokens": 223814194.0, + "step": 5864 + }, + { + "epoch": 0.7460882839333418, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.002127170562744, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8529641628265381, + "num_tokens": 223849141.0, + "step": 5865 + }, + { + "epoch": 0.7462154942119323, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.2876555919647217, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8810392618179321, + "num_tokens": 223888654.0, + "step": 5866 + }, + { + "epoch": 0.7463427044905229, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9674122333526611, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8622111678123474, + "num_tokens": 223924509.0, + "step": 5867 + }, + { + "epoch": 0.7464699147691134, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9425452947616577, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8454920053482056, + "num_tokens": 223966191.0, + "step": 5868 + }, + { + "epoch": 0.7465971250477038, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8161640167236328, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8618696928024292, + "num_tokens": 224001415.0, + "step": 5869 + }, + { + "epoch": 0.7467243353262943, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.7837752103805542, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8665457367897034, + "num_tokens": 224042820.0, + "step": 5870 + }, + { + "epoch": 0.7468515456048849, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.1559953689575195, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8650690913200378, + "num_tokens": 224078411.0, + "step": 5871 + }, + { + "epoch": 0.7469787558834754, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8038209676742554, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.866602897644043, + "num_tokens": 224117136.0, + "step": 5872 + }, + { + "epoch": 0.7471059661620659, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.901015281677246, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8527237176895142, + "num_tokens": 224155879.0, + "step": 5873 + }, + { + "epoch": 0.7472331764406565, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.976258635520935, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8699609637260437, + "num_tokens": 224195186.0, + "step": 5874 + }, + { + "epoch": 0.7473603867192469, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.7829264402389526, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8564953804016113, + "num_tokens": 224235722.0, + "step": 5875 + }, + { + "epoch": 0.7474875969978374, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.0254149436950684, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8685899972915649, + "num_tokens": 224268461.0, + "step": 5876 + }, + { + "epoch": 0.7476148072764279, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9470133781433105, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8725606203079224, + "num_tokens": 224302921.0, + "step": 5877 + }, + { + "epoch": 0.7477420175550185, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.7514950037002563, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8610481023788452, + "num_tokens": 224344081.0, + "step": 5878 + }, + { + "epoch": 0.747869227833609, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9254801273345947, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8478466868400574, + "num_tokens": 224384464.0, + "step": 5879 + }, + { + "epoch": 0.7479964381121995, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.7475768327713013, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8677375912666321, + "num_tokens": 224425542.0, + "step": 5880 + }, + { + "epoch": 0.7481236483907899, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7039426565170288, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8767443895339966, + "num_tokens": 224465342.0, + "step": 5881 + }, + { + "epoch": 0.7482508586693805, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8217754364013672, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.864317774772644, + "num_tokens": 224504787.0, + "step": 5882 + }, + { + "epoch": 0.748378068947971, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9034615755081177, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8611019849777222, + "num_tokens": 224548534.0, + "step": 5883 + }, + { + "epoch": 0.7485052792265615, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8201028108596802, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8520152568817139, + "num_tokens": 224590251.0, + "step": 5884 + }, + { + "epoch": 0.748632489505152, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9039033651351929, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8699619770050049, + "num_tokens": 224633745.0, + "step": 5885 + }, + { + "epoch": 0.7487596997837426, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.040606737136841, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.844410240650177, + "num_tokens": 224671319.0, + "step": 5886 + }, + { + "epoch": 0.748886910062333, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8601815700531006, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8604393601417542, + "num_tokens": 224712670.0, + "step": 5887 + }, + { + "epoch": 0.7490141203409235, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8536982536315918, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8571643829345703, + "num_tokens": 224747840.0, + "step": 5888 + }, + { + "epoch": 0.749141330619514, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8125343322753906, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8709326982498169, + "num_tokens": 224783274.0, + "step": 5889 + }, + { + "epoch": 0.7492685408981046, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.117375373840332, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8555728793144226, + "num_tokens": 224820456.0, + "step": 5890 + }, + { + "epoch": 0.7493957511766951, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.7848906517028809, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.864599347114563, + "num_tokens": 224858569.0, + "step": 5891 + }, + { + "epoch": 0.7495229614552856, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.948637843132019, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.85858553647995, + "num_tokens": 224901033.0, + "step": 5892 + }, + { + "epoch": 0.749650171733876, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.399925708770752, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8739821910858154, + "num_tokens": 224934345.0, + "step": 5893 + }, + { + "epoch": 0.7497773820124666, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8854042291641235, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8526538610458374, + "num_tokens": 224972479.0, + "step": 5894 + }, + { + "epoch": 0.7499045922910571, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8456772565841675, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.876142144203186, + "num_tokens": 225014213.0, + "step": 5895 + }, + { + "epoch": 0.7500318025696476, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8130563497543335, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8668094277381897, + "num_tokens": 225049929.0, + "step": 5896 + }, + { + "epoch": 0.7501590128482382, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8803014755249023, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8447455763816833, + "num_tokens": 225086409.0, + "step": 5897 + }, + { + "epoch": 0.7502862231268287, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8797035217285156, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.843077540397644, + "num_tokens": 225124443.0, + "step": 5898 + }, + { + "epoch": 0.7504134334054191, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9279639720916748, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8494372367858887, + "num_tokens": 225166665.0, + "step": 5899 + }, + { + "epoch": 0.7505406436840096, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9949607849121094, + "learning_rate": 1e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.8352400064468384, + "num_tokens": 225203841.0, + "step": 5900 + }, + { + "epoch": 0.7506678539626002, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.1902709007263184, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8432841300964355, + "num_tokens": 225234844.0, + "step": 5901 + }, + { + "epoch": 0.7507950642411907, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.0304665565490723, + "learning_rate": 1e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8417026400566101, + "num_tokens": 225269828.0, + "step": 5902 + }, + { + "epoch": 0.7509222745197812, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.7899110317230225, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8637893199920654, + "num_tokens": 225312361.0, + "step": 5903 + }, + { + "epoch": 0.7510494847983717, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9826185703277588, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8493047952651978, + "num_tokens": 225353708.0, + "step": 5904 + }, + { + "epoch": 0.7511766950769622, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.7402249574661255, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8778849840164185, + "num_tokens": 225399730.0, + "step": 5905 + }, + { + "epoch": 0.7513039053555527, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8817309141159058, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8535563945770264, + "num_tokens": 225441836.0, + "step": 5906 + }, + { + "epoch": 0.7514311156341432, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.0136609077453613, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8664542436599731, + "num_tokens": 225471004.0, + "step": 5907 + }, + { + "epoch": 0.7515583259127337, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.062192440032959, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8419978618621826, + "num_tokens": 225505789.0, + "step": 5908 + }, + { + "epoch": 0.7516855361913243, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9096566438674927, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8613909482955933, + "num_tokens": 225541138.0, + "step": 5909 + }, + { + "epoch": 0.7518127464699148, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9683375358581543, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8632559776306152, + "num_tokens": 225581695.0, + "step": 5910 + }, + { + "epoch": 0.7519399567485053, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8857096433639526, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8656435012817383, + "num_tokens": 225619532.0, + "step": 5911 + }, + { + "epoch": 0.7520671670270958, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8761242628097534, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8514560461044312, + "num_tokens": 225657403.0, + "step": 5912 + }, + { + "epoch": 0.7521943773056863, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.911239743232727, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8547326326370239, + "num_tokens": 225693402.0, + "step": 5913 + }, + { + "epoch": 0.7523215875842768, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9419503211975098, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8582026958465576, + "num_tokens": 225727208.0, + "step": 5914 + }, + { + "epoch": 0.7524487978628673, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.106872320175171, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8635632991790771, + "num_tokens": 225762117.0, + "step": 5915 + }, + { + "epoch": 0.7525760081414579, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 4.03878116607666, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8447962999343872, + "num_tokens": 225794737.0, + "step": 5916 + }, + { + "epoch": 0.7527032184200484, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.9866178035736084, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8588327169418335, + "num_tokens": 225829074.0, + "step": 5917 + }, + { + "epoch": 0.7528304286986388, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.8853379487991333, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8731516599655151, + "num_tokens": 225871344.0, + "step": 5918 + }, + { + "epoch": 0.7529576389772293, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.905198574066162, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.855290412902832, + "num_tokens": 225911321.0, + "step": 5919 + }, + { + "epoch": 0.7530848492558199, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.147193431854248, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8679998517036438, + "num_tokens": 225944613.0, + "step": 5920 + }, + { + "epoch": 0.7532120595344104, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.3110432624816895, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8644266128540039, + "num_tokens": 225989501.0, + "step": 5921 + }, + { + "epoch": 0.7533392698130009, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.9344887733459473, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8615315556526184, + "num_tokens": 226024984.0, + "step": 5922 + }, + { + "epoch": 0.7534664800915915, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.906801700592041, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8723192811012268, + "num_tokens": 226063255.0, + "step": 5923 + }, + { + "epoch": 0.7535936903701819, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.8499805927276611, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8474583625793457, + "num_tokens": 226105538.0, + "step": 5924 + }, + { + "epoch": 0.7537209006487724, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9836064577102661, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8626406192779541, + "num_tokens": 226137449.0, + "step": 5925 + }, + { + "epoch": 0.7538481109273629, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.045868396759033, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8530698418617249, + "num_tokens": 226171078.0, + "step": 5926 + }, + { + "epoch": 0.7539753212059535, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9266138076782227, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8618398904800415, + "num_tokens": 226208060.0, + "step": 5927 + }, + { + "epoch": 0.754102531484544, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8391904830932617, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8701204061508179, + "num_tokens": 226245330.0, + "step": 5928 + }, + { + "epoch": 0.7542297417631345, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9216934442520142, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8797557353973389, + "num_tokens": 226281974.0, + "step": 5929 + }, + { + "epoch": 0.7543569520417249, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.7977535724639893, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8503152132034302, + "num_tokens": 226323302.0, + "step": 5930 + }, + { + "epoch": 0.7544841623203155, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9222787618637085, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8464586138725281, + "num_tokens": 226361346.0, + "step": 5931 + }, + { + "epoch": 0.754611372598906, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8897137641906738, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.85474693775177, + "num_tokens": 226397744.0, + "step": 5932 + }, + { + "epoch": 0.7547385828774965, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.0157949924468994, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8679389953613281, + "num_tokens": 226436745.0, + "step": 5933 + }, + { + "epoch": 0.754865793156087, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8791877031326294, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8665395975112915, + "num_tokens": 226470302.0, + "step": 5934 + }, + { + "epoch": 0.7549930034346776, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 7.815921306610107, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8615102767944336, + "num_tokens": 226500090.0, + "step": 5935 + }, + { + "epoch": 0.755120213713268, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 2.2082812786102295, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8614110946655273, + "num_tokens": 226533359.0, + "step": 5936 + }, + { + "epoch": 0.7552474239918585, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.946558952331543, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8512910604476929, + "num_tokens": 226567060.0, + "step": 5937 + }, + { + "epoch": 0.755374634270449, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.9505313634872437, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8582665920257568, + "num_tokens": 226604270.0, + "step": 5938 + }, + { + "epoch": 0.7555018445490396, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.8913943767547607, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8572248220443726, + "num_tokens": 226641310.0, + "step": 5939 + }, + { + "epoch": 0.7556290548276301, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.965154767036438, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8490387797355652, + "num_tokens": 226678305.0, + "step": 5940 + }, + { + "epoch": 0.7557562651062206, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.8451592922210693, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8673099279403687, + "num_tokens": 226715628.0, + "step": 5941 + }, + { + "epoch": 0.755883475384811, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 16.594953536987305, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8706623911857605, + "num_tokens": 226755346.0, + "step": 5942 + }, + { + "epoch": 0.7560106856634016, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 8.441242218017578, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8502687811851501, + "num_tokens": 226792243.0, + "step": 5943 + }, + { + "epoch": 0.7561378959419921, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 2.8503637313842773, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8736461997032166, + "num_tokens": 226829033.0, + "step": 5944 + }, + { + "epoch": 0.7562651062205826, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.9831002950668335, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.874779999256134, + "num_tokens": 226870645.0, + "step": 5945 + }, + { + "epoch": 0.7563923164991732, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.0703399181365967, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8641960620880127, + "num_tokens": 226902395.0, + "step": 5946 + }, + { + "epoch": 0.7565195267777637, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.8436106443405151, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8659757971763611, + "num_tokens": 226942620.0, + "step": 5947 + }, + { + "epoch": 0.7566467370563541, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.9709681272506714, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8670071363449097, + "num_tokens": 226976472.0, + "step": 5948 + }, + { + "epoch": 0.7567739473349446, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.8982899188995361, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8500192165374756, + "num_tokens": 227013626.0, + "step": 5949 + }, + { + "epoch": 0.7569011576135352, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.3527278900146484, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8477745056152344, + "num_tokens": 227049336.0, + "step": 5950 + }, + { + "epoch": 0.7570283678921257, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.019214630126953, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8822660446166992, + "num_tokens": 227087371.0, + "step": 5951 + }, + { + "epoch": 0.7571555781707162, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.775339126586914, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.86248379945755, + "num_tokens": 227129188.0, + "step": 5952 + }, + { + "epoch": 0.7572827884493067, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9507445096969604, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.853030800819397, + "num_tokens": 227161807.0, + "step": 5953 + }, + { + "epoch": 0.7574099987278972, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.775301218032837, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8513420820236206, + "num_tokens": 227198431.0, + "step": 5954 + }, + { + "epoch": 0.7575372090064877, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.785329818725586, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8789001107215881, + "num_tokens": 227236046.0, + "step": 5955 + }, + { + "epoch": 0.7576644192850782, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.798904299736023, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8756937980651855, + "num_tokens": 227272925.0, + "step": 5956 + }, + { + "epoch": 0.7577916295636687, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.7818790674209595, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8650480508804321, + "num_tokens": 227312311.0, + "step": 5957 + }, + { + "epoch": 0.7579188398422593, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.9310919046401978, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8801404237747192, + "num_tokens": 227349632.0, + "step": 5958 + }, + { + "epoch": 0.7580460501208498, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.7699553966522217, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8597812652587891, + "num_tokens": 227392982.0, + "step": 5959 + }, + { + "epoch": 0.7581732603994403, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.9251371622085571, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8565025329589844, + "num_tokens": 227431878.0, + "step": 5960 + }, + { + "epoch": 0.7583004706780307, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.8182125091552734, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8504042625427246, + "num_tokens": 227470401.0, + "step": 5961 + }, + { + "epoch": 0.7584276809566213, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.9291130304336548, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8649741411209106, + "num_tokens": 227507189.0, + "step": 5962 + }, + { + "epoch": 0.7585548912352118, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8415642976760864, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.864350438117981, + "num_tokens": 227548070.0, + "step": 5963 + }, + { + "epoch": 0.7586821015138023, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.7081027030944824, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8659946918487549, + "num_tokens": 227584360.0, + "step": 5964 + }, + { + "epoch": 0.7588093117923929, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9411296844482422, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8666598796844482, + "num_tokens": 227618961.0, + "step": 5965 + }, + { + "epoch": 0.7589365220709834, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.8496301174163818, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8619036078453064, + "num_tokens": 227658304.0, + "step": 5966 + }, + { + "epoch": 0.7590637323495738, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.6741503477096558, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8581172227859497, + "num_tokens": 227701526.0, + "step": 5967 + }, + { + "epoch": 0.7591909426281643, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.722346305847168, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.866054117679596, + "num_tokens": 227744930.0, + "step": 5968 + }, + { + "epoch": 0.7593181529067549, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8751602172851562, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8711689710617065, + "num_tokens": 227780607.0, + "step": 5969 + }, + { + "epoch": 0.7594453631853454, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.8213800191879272, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8573703765869141, + "num_tokens": 227825742.0, + "step": 5970 + }, + { + "epoch": 0.7595725734639359, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.7923252582550049, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8411092162132263, + "num_tokens": 227865278.0, + "step": 5971 + }, + { + "epoch": 0.7596997837425264, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.8307743072509766, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8574074506759644, + "num_tokens": 227906912.0, + "step": 5972 + }, + { + "epoch": 0.7598269940211169, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.7207378149032593, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8585148453712463, + "num_tokens": 227947067.0, + "step": 5973 + }, + { + "epoch": 0.7599542042997074, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.294286012649536, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8427339792251587, + "num_tokens": 227988563.0, + "step": 5974 + }, + { + "epoch": 0.7600814145782979, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.9621093273162842, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8545477390289307, + "num_tokens": 228028556.0, + "step": 5975 + }, + { + "epoch": 0.7602086248568884, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.065730571746826, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8748037815093994, + "num_tokens": 228066265.0, + "step": 5976 + }, + { + "epoch": 0.760335835135479, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.7754557132720947, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8582233786582947, + "num_tokens": 228103732.0, + "step": 5977 + }, + { + "epoch": 0.7604630454140695, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.7901939153671265, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.858203649520874, + "num_tokens": 228149609.0, + "step": 5978 + }, + { + "epoch": 0.7605902556926599, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.8621641397476196, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8626525402069092, + "num_tokens": 228188462.0, + "step": 5979 + }, + { + "epoch": 0.7607174659712505, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.819946527481079, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8748327493667603, + "num_tokens": 228229252.0, + "step": 5980 + }, + { + "epoch": 0.760844676249841, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.9684611558914185, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.857634425163269, + "num_tokens": 228267397.0, + "step": 5981 + }, + { + "epoch": 0.7609718865284315, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.7353792190551758, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8685060739517212, + "num_tokens": 228307561.0, + "step": 5982 + }, + { + "epoch": 0.761099096807022, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 1.9262585639953613, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.861539363861084, + "num_tokens": 228347667.0, + "step": 5983 + }, + { + "epoch": 0.7612263070856126, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.884671688079834, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8586455583572388, + "num_tokens": 228386820.0, + "step": 5984 + }, + { + "epoch": 0.761353517364203, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.7693806886672974, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8697056770324707, + "num_tokens": 228427304.0, + "step": 5985 + }, + { + "epoch": 0.7614807276427935, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.7745752334594727, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8511910438537598, + "num_tokens": 228470779.0, + "step": 5986 + }, + { + "epoch": 0.761607937921384, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.0079567432403564, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8766719102859497, + "num_tokens": 228500454.0, + "step": 5987 + }, + { + "epoch": 0.7617351481999746, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 16.611982345581055, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8676344156265259, + "num_tokens": 228537374.0, + "step": 5988 + }, + { + "epoch": 0.7618623584785651, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 2.053929090499878, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8609344959259033, + "num_tokens": 228577215.0, + "step": 5989 + }, + { + "epoch": 0.7619895687571556, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.029967784881592, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.865654468536377, + "num_tokens": 228611693.0, + "step": 5990 + }, + { + "epoch": 0.762116779035746, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.068681478500366, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8603699803352356, + "num_tokens": 228644231.0, + "step": 5991 + }, + { + "epoch": 0.7622439893143366, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.6328684091567993, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.872198760509491, + "num_tokens": 228684776.0, + "step": 5992 + }, + { + "epoch": 0.7623711995929271, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.677045226097107, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8667901158332825, + "num_tokens": 228724214.0, + "step": 5993 + }, + { + "epoch": 0.7624984098715176, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.8490321636199951, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8600788116455078, + "num_tokens": 228765098.0, + "step": 5994 + }, + { + "epoch": 0.7626256201501082, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.7383780479431152, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8620177507400513, + "num_tokens": 228802423.0, + "step": 5995 + }, + { + "epoch": 0.7627528304286987, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.580286979675293, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8518256545066833, + "num_tokens": 228840218.0, + "step": 5996 + }, + { + "epoch": 0.7628800407072891, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.7325491905212402, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.866532564163208, + "num_tokens": 228882810.0, + "step": 5997 + }, + { + "epoch": 0.7630072509858796, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.058293104171753, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8676061630249023, + "num_tokens": 228917664.0, + "step": 5998 + }, + { + "epoch": 0.7631344612644702, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.971505880355835, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8687235116958618, + "num_tokens": 228951017.0, + "step": 5999 + }, + { + "epoch": 0.7632616715430607, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.8085596561431885, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8624172210693359, + "num_tokens": 228990949.0, + "step": 6000 + }, + { + "epoch": 0.7633888818216512, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.8476922512054443, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8664097189903259, + "num_tokens": 229026604.0, + "step": 6001 + }, + { + "epoch": 0.7635160921002417, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 2.009307622909546, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8563088774681091, + "num_tokens": 229061810.0, + "step": 6002 + }, + { + "epoch": 0.7636433023788322, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.7644399404525757, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8381894826889038, + "num_tokens": 229107765.0, + "step": 6003 + }, + { + "epoch": 0.7637705126574227, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.8213950395584106, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8488569855690002, + "num_tokens": 229147822.0, + "step": 6004 + }, + { + "epoch": 0.7638977229360132, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.8567043542861938, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8704061508178711, + "num_tokens": 229184421.0, + "step": 6005 + }, + { + "epoch": 0.7640249332146037, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.8340164422988892, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8621615767478943, + "num_tokens": 229221737.0, + "step": 6006 + }, + { + "epoch": 0.7641521434931943, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.9904773235321045, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.855255126953125, + "num_tokens": 229254727.0, + "step": 6007 + }, + { + "epoch": 0.7642793537717848, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.913153886795044, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8720825910568237, + "num_tokens": 229292648.0, + "step": 6008 + }, + { + "epoch": 0.7644065640503753, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.8278061151504517, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8659040927886963, + "num_tokens": 229335708.0, + "step": 6009 + }, + { + "epoch": 0.7645337743289657, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.861775279045105, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8689978718757629, + "num_tokens": 229373791.0, + "step": 6010 + }, + { + "epoch": 0.7646609846075563, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.8504852056503296, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.870275616645813, + "num_tokens": 229409891.0, + "step": 6011 + }, + { + "epoch": 0.7647881948861468, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.8648329973220825, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8629865646362305, + "num_tokens": 229449859.0, + "step": 6012 + }, + { + "epoch": 0.7649154051647373, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.0307631492614746, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8678033351898193, + "num_tokens": 229484224.0, + "step": 6013 + }, + { + "epoch": 0.7650426154433279, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.987292766571045, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8556944727897644, + "num_tokens": 229521104.0, + "step": 6014 + }, + { + "epoch": 0.7651698257219184, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 1.9810097217559814, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8453798890113831, + "num_tokens": 229558918.0, + "step": 6015 + }, + { + "epoch": 0.7652970360005088, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.884223222732544, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8480472564697266, + "num_tokens": 229596931.0, + "step": 6016 + }, + { + "epoch": 0.7654242462790993, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.9075205326080322, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8459935784339905, + "num_tokens": 229633903.0, + "step": 6017 + }, + { + "epoch": 0.7655514565576899, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.8607382774353027, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8584822416305542, + "num_tokens": 229674224.0, + "step": 6018 + }, + { + "epoch": 0.7656786668362804, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.8247630596160889, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8595072031021118, + "num_tokens": 229710547.0, + "step": 6019 + }, + { + "epoch": 0.7658058771148709, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.0239787101745605, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8756650686264038, + "num_tokens": 229752020.0, + "step": 6020 + }, + { + "epoch": 0.7659330873934614, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.0236921310424805, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8583002090454102, + "num_tokens": 229785710.0, + "step": 6021 + }, + { + "epoch": 0.7660602976720519, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.833423137664795, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8685389757156372, + "num_tokens": 229824118.0, + "step": 6022 + }, + { + "epoch": 0.7661875079506424, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.9647897481918335, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8736777305603027, + "num_tokens": 229854231.0, + "step": 6023 + }, + { + "epoch": 0.7663147182292329, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8345873355865479, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8517354726791382, + "num_tokens": 229888102.0, + "step": 6024 + }, + { + "epoch": 0.7664419285078234, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.0104942321777344, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.865234375, + "num_tokens": 229927634.0, + "step": 6025 + }, + { + "epoch": 0.766569138786414, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.8755759000778198, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8690686225891113, + "num_tokens": 229966526.0, + "step": 6026 + }, + { + "epoch": 0.7666963490650045, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.893349051475525, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8708940744400024, + "num_tokens": 230002761.0, + "step": 6027 + }, + { + "epoch": 0.7668235593435949, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.8443747758865356, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8666194677352905, + "num_tokens": 230041303.0, + "step": 6028 + }, + { + "epoch": 0.7669507696221854, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.7685701847076416, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8660284876823425, + "num_tokens": 230081117.0, + "step": 6029 + }, + { + "epoch": 0.767077979900776, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.9149972200393677, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8559249639511108, + "num_tokens": 230119411.0, + "step": 6030 + }, + { + "epoch": 0.7672051901793665, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.880165696144104, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8683538436889648, + "num_tokens": 230161111.0, + "step": 6031 + }, + { + "epoch": 0.767332400457957, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.0623128414154053, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8580375909805298, + "num_tokens": 230196999.0, + "step": 6032 + }, + { + "epoch": 0.7674596107365476, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.9681222438812256, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8638041019439697, + "num_tokens": 230231495.0, + "step": 6033 + }, + { + "epoch": 0.767586821015138, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.0901598930358887, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8543136715888977, + "num_tokens": 230267732.0, + "step": 6034 + }, + { + "epoch": 0.7677140312937285, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.1139142513275146, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8598315715789795, + "num_tokens": 230297559.0, + "step": 6035 + }, + { + "epoch": 0.767841241572319, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.9140838384628296, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8496643900871277, + "num_tokens": 230336396.0, + "step": 6036 + }, + { + "epoch": 0.7679684518509096, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.9579510688781738, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8628337383270264, + "num_tokens": 230368152.0, + "step": 6037 + }, + { + "epoch": 0.7680956621295001, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.9086343050003052, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8674749135971069, + "num_tokens": 230408081.0, + "step": 6038 + }, + { + "epoch": 0.7682228724080906, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.0084781646728516, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8550277352333069, + "num_tokens": 230441691.0, + "step": 6039 + }, + { + "epoch": 0.768350082686681, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.7240492105484009, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8594086170196533, + "num_tokens": 230480089.0, + "step": 6040 + }, + { + "epoch": 0.7684772929652716, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.886889100074768, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8564776182174683, + "num_tokens": 230518655.0, + "step": 6041 + }, + { + "epoch": 0.7686045032438621, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.014200210571289, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8650913834571838, + "num_tokens": 230555717.0, + "step": 6042 + }, + { + "epoch": 0.7687317135224526, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.7472655773162842, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8693947792053223, + "num_tokens": 230596699.0, + "step": 6043 + }, + { + "epoch": 0.7688589238010431, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.8303219079971313, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8696388602256775, + "num_tokens": 230633284.0, + "step": 6044 + }, + { + "epoch": 0.7689861340796337, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.7444276809692383, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8643079996109009, + "num_tokens": 230673133.0, + "step": 6045 + }, + { + "epoch": 0.7691133443582241, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.773727297782898, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.862115740776062, + "num_tokens": 230719003.0, + "step": 6046 + }, + { + "epoch": 0.7692405546368146, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.7960034608840942, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.871465265750885, + "num_tokens": 230754805.0, + "step": 6047 + }, + { + "epoch": 0.7693677649154052, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.8836796283721924, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8478947877883911, + "num_tokens": 230793249.0, + "step": 6048 + }, + { + "epoch": 0.7694949751939957, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.8927831649780273, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8780767917633057, + "num_tokens": 230826025.0, + "step": 6049 + }, + { + "epoch": 0.7696221854725862, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.8126300573349, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8729735016822815, + "num_tokens": 230866615.0, + "step": 6050 + }, + { + "epoch": 0.7697493957511767, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.0349833965301514, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8464540243148804, + "num_tokens": 230901587.0, + "step": 6051 + }, + { + "epoch": 0.7698766060297672, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.767313838005066, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8669142127037048, + "num_tokens": 230944267.0, + "step": 6052 + }, + { + "epoch": 0.7700038163083577, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.9809746742248535, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8529669046401978, + "num_tokens": 230980022.0, + "step": 6053 + }, + { + "epoch": 0.7701310265869482, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.796942114830017, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8687880039215088, + "num_tokens": 231018110.0, + "step": 6054 + }, + { + "epoch": 0.7702582368655387, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.6919506788253784, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.866308331489563, + "num_tokens": 231056531.0, + "step": 6055 + }, + { + "epoch": 0.7703854471441293, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.7708401679992676, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8664641380310059, + "num_tokens": 231096270.0, + "step": 6056 + }, + { + "epoch": 0.7705126574227198, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.9090685844421387, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8548848032951355, + "num_tokens": 231136300.0, + "step": 6057 + }, + { + "epoch": 0.7706398677013102, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.9598288536071777, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8489007353782654, + "num_tokens": 231169005.0, + "step": 6058 + }, + { + "epoch": 0.7707670779799007, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.917161464691162, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.867685854434967, + "num_tokens": 231205274.0, + "step": 6059 + }, + { + "epoch": 0.7708942882584913, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8661009073257446, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8676002025604248, + "num_tokens": 231239022.0, + "step": 6060 + }, + { + "epoch": 0.7710214985370818, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.8982850313186646, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8436049222946167, + "num_tokens": 231282952.0, + "step": 6061 + }, + { + "epoch": 0.7711487088156723, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.862344741821289, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8590496778488159, + "num_tokens": 231322980.0, + "step": 6062 + }, + { + "epoch": 0.7712759190942629, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.0611236095428467, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8539243936538696, + "num_tokens": 231358382.0, + "step": 6063 + }, + { + "epoch": 0.7714031293728534, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.7232609987258911, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.869800329208374, + "num_tokens": 231401915.0, + "step": 6064 + }, + { + "epoch": 0.7715303396514438, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.068195104598999, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.839381992816925, + "num_tokens": 231433241.0, + "step": 6065 + }, + { + "epoch": 0.7716575499300343, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8504414558410645, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8571288585662842, + "num_tokens": 231478318.0, + "step": 6066 + }, + { + "epoch": 0.7717847602086249, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8556081056594849, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8530081510543823, + "num_tokens": 231518584.0, + "step": 6067 + }, + { + "epoch": 0.7719119704872154, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.9099475145339966, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8620625138282776, + "num_tokens": 231556689.0, + "step": 6068 + }, + { + "epoch": 0.7720391807658059, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.9304770231246948, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8607857823371887, + "num_tokens": 231593936.0, + "step": 6069 + }, + { + "epoch": 0.7721663910443964, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.779623031616211, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8804306983947754, + "num_tokens": 231634785.0, + "step": 6070 + }, + { + "epoch": 0.7722936013229869, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8484681844711304, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8568046689033508, + "num_tokens": 231675798.0, + "step": 6071 + }, + { + "epoch": 0.7724208116015774, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.871824026107788, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8584504723548889, + "num_tokens": 231711611.0, + "step": 6072 + }, + { + "epoch": 0.7725480218801679, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8787068128585815, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.851440966129303, + "num_tokens": 231746997.0, + "step": 6073 + }, + { + "epoch": 0.7726752321587584, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.044102191925049, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8409063816070557, + "num_tokens": 231787696.0, + "step": 6074 + }, + { + "epoch": 0.772802442437349, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9331951141357422, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8531649112701416, + "num_tokens": 231826127.0, + "step": 6075 + }, + { + "epoch": 0.7729296527159395, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 2.0054502487182617, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8559408187866211, + "num_tokens": 231857538.0, + "step": 6076 + }, + { + "epoch": 0.7730568629945299, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 2.0175788402557373, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8594156503677368, + "num_tokens": 231893913.0, + "step": 6077 + }, + { + "epoch": 0.7731840732731204, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.848961353302002, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8635470271110535, + "num_tokens": 231934274.0, + "step": 6078 + }, + { + "epoch": 0.773311283551711, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.701557993888855, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8544639945030212, + "num_tokens": 231983197.0, + "step": 6079 + }, + { + "epoch": 0.7734384938303015, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.8440897464752197, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8553375005722046, + "num_tokens": 232021265.0, + "step": 6080 + }, + { + "epoch": 0.773565704108892, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.9097468852996826, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8593557476997375, + "num_tokens": 232060992.0, + "step": 6081 + }, + { + "epoch": 0.7736929143874826, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8526583909988403, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8581503629684448, + "num_tokens": 232096758.0, + "step": 6082 + }, + { + "epoch": 0.773820124666073, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8734835386276245, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8571696877479553, + "num_tokens": 232133773.0, + "step": 6083 + }, + { + "epoch": 0.7739473349446635, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.7386181354522705, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8591462969779968, + "num_tokens": 232173182.0, + "step": 6084 + }, + { + "epoch": 0.774074545223254, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.9384326934814453, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8675205707550049, + "num_tokens": 232206478.0, + "step": 6085 + }, + { + "epoch": 0.7742017555018446, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.03252911567688, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8547145128250122, + "num_tokens": 232242231.0, + "step": 6086 + }, + { + "epoch": 0.7743289657804351, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.095790147781372, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.863801121711731, + "num_tokens": 232270636.0, + "step": 6087 + }, + { + "epoch": 0.7744561760590256, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.7522826194763184, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8653885722160339, + "num_tokens": 232309974.0, + "step": 6088 + }, + { + "epoch": 0.774583386337616, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.0431976318359375, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8705083131790161, + "num_tokens": 232344118.0, + "step": 6089 + }, + { + "epoch": 0.7747105966162066, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8007969856262207, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8506898283958435, + "num_tokens": 232385183.0, + "step": 6090 + }, + { + "epoch": 0.7748378068947971, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8240852355957031, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8615932464599609, + "num_tokens": 232421670.0, + "step": 6091 + }, + { + "epoch": 0.7749650171733876, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.772609829902649, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8491345643997192, + "num_tokens": 232459563.0, + "step": 6092 + }, + { + "epoch": 0.7750922274519781, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.159569501876831, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8607404232025146, + "num_tokens": 232496357.0, + "step": 6093 + }, + { + "epoch": 0.7752194377305687, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.196356773376465, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8616248369216919, + "num_tokens": 232530436.0, + "step": 6094 + }, + { + "epoch": 0.7753466480091591, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8371665477752686, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8548799753189087, + "num_tokens": 232568585.0, + "step": 6095 + }, + { + "epoch": 0.7754738582877496, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8393797874450684, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8666965961456299, + "num_tokens": 232602506.0, + "step": 6096 + }, + { + "epoch": 0.7756010685663401, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.9088937044143677, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8693163394927979, + "num_tokens": 232642819.0, + "step": 6097 + }, + { + "epoch": 0.7757282788449307, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.9383434057235718, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8478841781616211, + "num_tokens": 232679505.0, + "step": 6098 + }, + { + "epoch": 0.7758554891235212, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.9101130962371826, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8657174110412598, + "num_tokens": 232715701.0, + "step": 6099 + }, + { + "epoch": 0.7759826994021117, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.0021579265594482, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8619678616523743, + "num_tokens": 232750786.0, + "step": 6100 + }, + { + "epoch": 0.7761099096807021, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.7519781589508057, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8506810069084167, + "num_tokens": 232791370.0, + "step": 6101 + }, + { + "epoch": 0.7762371199592927, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.0291147232055664, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8640204071998596, + "num_tokens": 232830895.0, + "step": 6102 + }, + { + "epoch": 0.7763643302378832, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.1513853073120117, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8595899939537048, + "num_tokens": 232869940.0, + "step": 6103 + }, + { + "epoch": 0.7764915405164737, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.8588218688964844, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8553391098976135, + "num_tokens": 232910743.0, + "step": 6104 + }, + { + "epoch": 0.7766187507950643, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.0091867446899414, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8597665429115295, + "num_tokens": 232948428.0, + "step": 6105 + }, + { + "epoch": 0.7767459610736548, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.8446412086486816, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8681303858757019, + "num_tokens": 232985142.0, + "step": 6106 + }, + { + "epoch": 0.7768731713522452, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.8326396942138672, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.870646595954895, + "num_tokens": 233020810.0, + "step": 6107 + }, + { + "epoch": 0.7770003816308357, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.9035766124725342, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8528046011924744, + "num_tokens": 233055674.0, + "step": 6108 + }, + { + "epoch": 0.7771275919094263, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.078216552734375, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8556458950042725, + "num_tokens": 233084506.0, + "step": 6109 + }, + { + "epoch": 0.7772548021880168, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.6999502182006836, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.877871036529541, + "num_tokens": 233120172.0, + "step": 6110 + }, + { + "epoch": 0.7773820124666073, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.952486276626587, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8504677414894104, + "num_tokens": 233160075.0, + "step": 6111 + }, + { + "epoch": 0.7775092227451978, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.7530279159545898, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8532716035842896, + "num_tokens": 233201714.0, + "step": 6112 + }, + { + "epoch": 0.7776364330237884, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.0113320350646973, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8673031330108643, + "num_tokens": 233237642.0, + "step": 6113 + }, + { + "epoch": 0.7777636433023788, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.751201868057251, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8662458062171936, + "num_tokens": 233279712.0, + "step": 6114 + }, + { + "epoch": 0.7778908535809693, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.6997568607330322, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8653082251548767, + "num_tokens": 233322441.0, + "step": 6115 + }, + { + "epoch": 0.7780180638595598, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.6818989515304565, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8552470207214355, + "num_tokens": 233371053.0, + "step": 6116 + }, + { + "epoch": 0.7781452741381504, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.9305311441421509, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8624486327171326, + "num_tokens": 233405372.0, + "step": 6117 + }, + { + "epoch": 0.7782724844167409, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.7191225290298462, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8673909902572632, + "num_tokens": 233447209.0, + "step": 6118 + }, + { + "epoch": 0.7783996946953314, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.7689820528030396, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8642503023147583, + "num_tokens": 233484621.0, + "step": 6119 + }, + { + "epoch": 0.7785269049739219, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 2.0627434253692627, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8450473546981812, + "num_tokens": 233520756.0, + "step": 6120 + }, + { + "epoch": 0.7786541152525124, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.8878421783447266, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8525618314743042, + "num_tokens": 233559120.0, + "step": 6121 + }, + { + "epoch": 0.7787813255311029, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.108689785003662, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8713006973266602, + "num_tokens": 233590513.0, + "step": 6122 + }, + { + "epoch": 0.7789085358096934, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.9507821798324585, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8764944672584534, + "num_tokens": 233625986.0, + "step": 6123 + }, + { + "epoch": 0.779035746088284, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.0002284049987793, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8690626621246338, + "num_tokens": 233655097.0, + "step": 6124 + }, + { + "epoch": 0.7791629563668745, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.9614815711975098, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8515290021896362, + "num_tokens": 233687815.0, + "step": 6125 + }, + { + "epoch": 0.7792901666454649, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.9083962440490723, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8748401999473572, + "num_tokens": 233722611.0, + "step": 6126 + }, + { + "epoch": 0.7794173769240554, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8626641035079956, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8485010266304016, + "num_tokens": 233765345.0, + "step": 6127 + }, + { + "epoch": 0.779544587202646, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.9818415641784668, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8621203303337097, + "num_tokens": 233804358.0, + "step": 6128 + }, + { + "epoch": 0.7796717974812365, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8849488496780396, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8505744338035583, + "num_tokens": 233845117.0, + "step": 6129 + }, + { + "epoch": 0.779799007759827, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8875240087509155, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8541042804718018, + "num_tokens": 233884091.0, + "step": 6130 + }, + { + "epoch": 0.7799262180384176, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.918962836265564, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8588317036628723, + "num_tokens": 233922590.0, + "step": 6131 + }, + { + "epoch": 0.780053428317008, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.962659478187561, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8635231256484985, + "num_tokens": 233954138.0, + "step": 6132 + }, + { + "epoch": 0.7801806385955985, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8582555055618286, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8697340488433838, + "num_tokens": 233988035.0, + "step": 6133 + }, + { + "epoch": 0.780307848874189, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.874386191368103, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8506247997283936, + "num_tokens": 234023747.0, + "step": 6134 + }, + { + "epoch": 0.7804350591527796, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.7309900522232056, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8703882098197937, + "num_tokens": 234064182.0, + "step": 6135 + }, + { + "epoch": 0.7805622694313701, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.002429962158203, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8444205522537231, + "num_tokens": 234106533.0, + "step": 6136 + }, + { + "epoch": 0.7806894797099606, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.7602311372756958, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8732407093048096, + "num_tokens": 234145492.0, + "step": 6137 + }, + { + "epoch": 0.780816689988551, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.090317487716675, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8640966415405273, + "num_tokens": 234181709.0, + "step": 6138 + }, + { + "epoch": 0.7809439002671416, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.8770962953567505, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8616126775741577, + "num_tokens": 234217854.0, + "step": 6139 + }, + { + "epoch": 0.7810711105457321, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8160797357559204, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8617290258407593, + "num_tokens": 234255769.0, + "step": 6140 + }, + { + "epoch": 0.7811983208243226, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.124037981033325, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8472343683242798, + "num_tokens": 234290828.0, + "step": 6141 + }, + { + "epoch": 0.7813255311029131, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8356947898864746, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8603711128234863, + "num_tokens": 234326336.0, + "step": 6142 + }, + { + "epoch": 0.7814527413815037, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8628345727920532, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8565388917922974, + "num_tokens": 234366210.0, + "step": 6143 + }, + { + "epoch": 0.7815799516600941, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.0117897987365723, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8523022532463074, + "num_tokens": 234398975.0, + "step": 6144 + }, + { + "epoch": 0.7817071619386846, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9356509447097778, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.854866623878479, + "num_tokens": 234437746.0, + "step": 6145 + }, + { + "epoch": 0.7818343722172751, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.717240333557129, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8661008477210999, + "num_tokens": 234478402.0, + "step": 6146 + }, + { + "epoch": 0.7819615824958657, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 2.0441372394561768, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8453276753425598, + "num_tokens": 234509699.0, + "step": 6147 + }, + { + "epoch": 0.7820887927744562, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.7464826107025146, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8555408120155334, + "num_tokens": 234551515.0, + "step": 6148 + }, + { + "epoch": 0.7822160030530467, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 2.5616588592529297, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8677942752838135, + "num_tokens": 234582811.0, + "step": 6149 + }, + { + "epoch": 0.7823432133316371, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7722976207733154, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8525897860527039, + "num_tokens": 234625299.0, + "step": 6150 + }, + { + "epoch": 0.7824704236102277, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8353890180587769, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.865395724773407, + "num_tokens": 234665641.0, + "step": 6151 + }, + { + "epoch": 0.7825976338888182, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.9386979341506958, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8590832948684692, + "num_tokens": 234704668.0, + "step": 6152 + }, + { + "epoch": 0.7827248441674087, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9343314170837402, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.864393949508667, + "num_tokens": 234738080.0, + "step": 6153 + }, + { + "epoch": 0.7828520544459993, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.754314661026001, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8583686351776123, + "num_tokens": 234781077.0, + "step": 6154 + }, + { + "epoch": 0.7829792647245898, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.123701333999634, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8459514379501343, + "num_tokens": 234809374.0, + "step": 6155 + }, + { + "epoch": 0.7831064750031802, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.9283615350723267, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8486503958702087, + "num_tokens": 234845585.0, + "step": 6156 + }, + { + "epoch": 0.7832336852817707, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.7773423194885254, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8595062494277954, + "num_tokens": 234883891.0, + "step": 6157 + }, + { + "epoch": 0.7833608955603613, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.9135801792144775, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8604614734649658, + "num_tokens": 234919445.0, + "step": 6158 + }, + { + "epoch": 0.7834881058389518, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.8635337352752686, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8633289337158203, + "num_tokens": 234951465.0, + "step": 6159 + }, + { + "epoch": 0.7836153161175423, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.8745217323303223, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8529213666915894, + "num_tokens": 234989970.0, + "step": 6160 + }, + { + "epoch": 0.7837425263961328, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9733856916427612, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.861797034740448, + "num_tokens": 235029673.0, + "step": 6161 + }, + { + "epoch": 0.7838697366747234, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.9035643339157104, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8597707748413086, + "num_tokens": 235065389.0, + "step": 6162 + }, + { + "epoch": 0.7839969469533138, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.0408060550689697, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8719845414161682, + "num_tokens": 235099795.0, + "step": 6163 + }, + { + "epoch": 0.7841241572319043, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.9625556468963623, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8613965511322021, + "num_tokens": 235137475.0, + "step": 6164 + }, + { + "epoch": 0.7842513675104948, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.012183427810669, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8558705449104309, + "num_tokens": 235178454.0, + "step": 6165 + }, + { + "epoch": 0.7843785777890854, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9050171375274658, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8781274557113647, + "num_tokens": 235216139.0, + "step": 6166 + }, + { + "epoch": 0.7845057880676759, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7476568222045898, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8752661943435669, + "num_tokens": 235253571.0, + "step": 6167 + }, + { + "epoch": 0.7846329983462664, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8514600992202759, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8448461294174194, + "num_tokens": 235293950.0, + "step": 6168 + }, + { + "epoch": 0.7847602086248568, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8359839916229248, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8650046586990356, + "num_tokens": 235336419.0, + "step": 6169 + }, + { + "epoch": 0.7848874189034474, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.8879567384719849, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8644759654998779, + "num_tokens": 235376039.0, + "step": 6170 + }, + { + "epoch": 0.7850146291820379, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8981201648712158, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.849197506904602, + "num_tokens": 235412736.0, + "step": 6171 + }, + { + "epoch": 0.7851418394606284, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.7135978937149048, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8607258796691895, + "num_tokens": 235457318.0, + "step": 6172 + }, + { + "epoch": 0.785269049739219, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.929877758026123, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.849800705909729, + "num_tokens": 235491471.0, + "step": 6173 + }, + { + "epoch": 0.7853962600178095, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.8770644664764404, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.871326208114624, + "num_tokens": 235531324.0, + "step": 6174 + }, + { + "epoch": 0.7855234702963999, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.984015703201294, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8492896556854248, + "num_tokens": 235573233.0, + "step": 6175 + }, + { + "epoch": 0.7856506805749904, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.7207964658737183, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8618147969245911, + "num_tokens": 235614400.0, + "step": 6176 + }, + { + "epoch": 0.785777890853581, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8143318891525269, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8673852682113647, + "num_tokens": 235652247.0, + "step": 6177 + }, + { + "epoch": 0.7859051011321715, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.1256892681121826, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8564441204071045, + "num_tokens": 235681810.0, + "step": 6178 + }, + { + "epoch": 0.786032311410762, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.0032219886779785, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8691114783287048, + "num_tokens": 235718422.0, + "step": 6179 + }, + { + "epoch": 0.7861595216893525, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8801074028015137, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8563472032546997, + "num_tokens": 235758886.0, + "step": 6180 + }, + { + "epoch": 0.786286731967943, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.0163075923919678, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8641300201416016, + "num_tokens": 235792270.0, + "step": 6181 + }, + { + "epoch": 0.7864139422465335, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.045135974884033, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8502551317214966, + "num_tokens": 235826678.0, + "step": 6182 + }, + { + "epoch": 0.786541152525124, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9412155151367188, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.872317910194397, + "num_tokens": 235860437.0, + "step": 6183 + }, + { + "epoch": 0.7866683628037145, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8032504320144653, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8661260008811951, + "num_tokens": 235900865.0, + "step": 6184 + }, + { + "epoch": 0.7867955730823051, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.0394134521484375, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8470361232757568, + "num_tokens": 235937419.0, + "step": 6185 + }, + { + "epoch": 0.7869227833608956, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.893608570098877, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8501513600349426, + "num_tokens": 235976425.0, + "step": 6186 + }, + { + "epoch": 0.787049993639486, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.951256513595581, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8551570773124695, + "num_tokens": 236012968.0, + "step": 6187 + }, + { + "epoch": 0.7871772039180766, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.912779450416565, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8623104095458984, + "num_tokens": 236052462.0, + "step": 6188 + }, + { + "epoch": 0.7873044141966671, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.9405583143234253, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8757537007331848, + "num_tokens": 236087604.0, + "step": 6189 + }, + { + "epoch": 0.7874316244752576, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8366535902023315, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8591793775558472, + "num_tokens": 236126344.0, + "step": 6190 + }, + { + "epoch": 0.7875588347538481, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9684267044067383, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.868037223815918, + "num_tokens": 236161913.0, + "step": 6191 + }, + { + "epoch": 0.7876860450324387, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8942985534667969, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8640197515487671, + "num_tokens": 236200612.0, + "step": 6192 + }, + { + "epoch": 0.7878132553110291, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9129436016082764, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8670211434364319, + "num_tokens": 236238136.0, + "step": 6193 + }, + { + "epoch": 0.7879404655896196, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8774380683898926, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8579552173614502, + "num_tokens": 236281303.0, + "step": 6194 + }, + { + "epoch": 0.7880676758682101, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.0582923889160156, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8565092086791992, + "num_tokens": 236322355.0, + "step": 6195 + }, + { + "epoch": 0.7881948861468007, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9133602380752563, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.859468936920166, + "num_tokens": 236354263.0, + "step": 6196 + }, + { + "epoch": 0.7883220964253912, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.7600566148757935, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8633257150650024, + "num_tokens": 236391581.0, + "step": 6197 + }, + { + "epoch": 0.7884493067039817, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9013665914535522, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8695123791694641, + "num_tokens": 236426003.0, + "step": 6198 + }, + { + "epoch": 0.7885765169825721, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.9662415981292725, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8703687191009521, + "num_tokens": 236464874.0, + "step": 6199 + }, + { + "epoch": 0.7887037272611627, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9326212406158447, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8726010322570801, + "num_tokens": 236499826.0, + "step": 6200 + }, + { + "epoch": 0.7888309375397532, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.821363925933838, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8725277185440063, + "num_tokens": 236538615.0, + "step": 6201 + }, + { + "epoch": 0.7889581478183437, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.926202416419983, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8589543104171753, + "num_tokens": 236577860.0, + "step": 6202 + }, + { + "epoch": 0.7890853580969343, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.7842706441879272, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.875386118888855, + "num_tokens": 236619041.0, + "step": 6203 + }, + { + "epoch": 0.7892125683755248, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.9493294954299927, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8643988370895386, + "num_tokens": 236655788.0, + "step": 6204 + }, + { + "epoch": 0.7893397786541152, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.867156982421875, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8728484511375427, + "num_tokens": 236691532.0, + "step": 6205 + }, + { + "epoch": 0.7894669889327057, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8559237718582153, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8675060272216797, + "num_tokens": 236733792.0, + "step": 6206 + }, + { + "epoch": 0.7895941992112963, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.6719183921813965, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8741430044174194, + "num_tokens": 236775454.0, + "step": 6207 + }, + { + "epoch": 0.7897214094898868, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.3547961711883545, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8705503940582275, + "num_tokens": 236811572.0, + "step": 6208 + }, + { + "epoch": 0.7898486197684773, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.930449366569519, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8461036682128906, + "num_tokens": 236851018.0, + "step": 6209 + }, + { + "epoch": 0.7899758300470678, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7463665008544922, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8659217953681946, + "num_tokens": 236891871.0, + "step": 6210 + }, + { + "epoch": 0.7901030403256584, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.9268790483474731, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8653180599212646, + "num_tokens": 236926771.0, + "step": 6211 + }, + { + "epoch": 0.7902302506042488, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8982740640640259, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8490573763847351, + "num_tokens": 236964768.0, + "step": 6212 + }, + { + "epoch": 0.7903574608828393, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8699086904525757, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8487600088119507, + "num_tokens": 237005075.0, + "step": 6213 + }, + { + "epoch": 0.7904846711614298, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.013019323348999, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8630070686340332, + "num_tokens": 237035517.0, + "step": 6214 + }, + { + "epoch": 0.7906118814400204, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.7196966409683228, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8783769607543945, + "num_tokens": 237076319.0, + "step": 6215 + }, + { + "epoch": 0.7907390917186109, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8030050992965698, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8527805805206299, + "num_tokens": 237117935.0, + "step": 6216 + }, + { + "epoch": 0.7908663019972014, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.7879542112350464, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8711172342300415, + "num_tokens": 237155789.0, + "step": 6217 + }, + { + "epoch": 0.7909935122757918, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.6818304061889648, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8672590851783752, + "num_tokens": 237198870.0, + "step": 6218 + }, + { + "epoch": 0.7911207225543824, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8351184129714966, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8674976825714111, + "num_tokens": 237243084.0, + "step": 6219 + }, + { + "epoch": 0.7912479328329729, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.0852198600769043, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.85247403383255, + "num_tokens": 237277245.0, + "step": 6220 + }, + { + "epoch": 0.7913751431115634, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.8183658123016357, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8673087954521179, + "num_tokens": 237317841.0, + "step": 6221 + }, + { + "epoch": 0.791502353390154, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 2.1412434577941895, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8515317440032959, + "num_tokens": 237349327.0, + "step": 6222 + }, + { + "epoch": 0.7916295636687445, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.9047290086746216, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8469167947769165, + "num_tokens": 237396179.0, + "step": 6223 + }, + { + "epoch": 0.7917567739473349, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8602665662765503, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8585931062698364, + "num_tokens": 237437766.0, + "step": 6224 + }, + { + "epoch": 0.7918839842259254, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.8174651861190796, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8658406734466553, + "num_tokens": 237479239.0, + "step": 6225 + }, + { + "epoch": 0.792011194504516, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.9767159223556519, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8758131265640259, + "num_tokens": 237514641.0, + "step": 6226 + }, + { + "epoch": 0.7921384047831065, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7216095924377441, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8716208934783936, + "num_tokens": 237555319.0, + "step": 6227 + }, + { + "epoch": 0.792265615061697, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.708736538887024, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8638302087783813, + "num_tokens": 237597786.0, + "step": 6228 + }, + { + "epoch": 0.7923928253402875, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8345891237258911, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8730993270874023, + "num_tokens": 237636584.0, + "step": 6229 + }, + { + "epoch": 0.792520035618878, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.9008312225341797, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8533828854560852, + "num_tokens": 237676000.0, + "step": 6230 + }, + { + "epoch": 0.7926472458974685, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8544942140579224, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.887243926525116, + "num_tokens": 237713003.0, + "step": 6231 + }, + { + "epoch": 0.792774456176059, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.064286470413208, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8631464838981628, + "num_tokens": 237742054.0, + "step": 6232 + }, + { + "epoch": 0.7929016664546495, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.944182276725769, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.853127121925354, + "num_tokens": 237783147.0, + "step": 6233 + }, + { + "epoch": 0.7930288767332401, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.084944248199463, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.856877326965332, + "num_tokens": 237814322.0, + "step": 6234 + }, + { + "epoch": 0.7931560870118306, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.8616902828216553, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8660379648208618, + "num_tokens": 237849278.0, + "step": 6235 + }, + { + "epoch": 0.793283297290421, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.4737508296966553, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8652878403663635, + "num_tokens": 237893434.0, + "step": 6236 + }, + { + "epoch": 0.7934105075690115, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.806694507598877, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8696126341819763, + "num_tokens": 237932135.0, + "step": 6237 + }, + { + "epoch": 0.7935377178476021, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.951856017112732, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8562414646148682, + "num_tokens": 237966320.0, + "step": 6238 + }, + { + "epoch": 0.7936649281261926, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.865749478340149, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8663496375083923, + "num_tokens": 238006365.0, + "step": 6239 + }, + { + "epoch": 0.7937921384047831, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9230772256851196, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8580456376075745, + "num_tokens": 238039307.0, + "step": 6240 + }, + { + "epoch": 0.7939193486833737, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9434608221054077, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8475608825683594, + "num_tokens": 238078015.0, + "step": 6241 + }, + { + "epoch": 0.7940465589619641, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.94192636013031, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8595874905586243, + "num_tokens": 238116790.0, + "step": 6242 + }, + { + "epoch": 0.7941737692405546, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8591748476028442, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8673374652862549, + "num_tokens": 238155330.0, + "step": 6243 + }, + { + "epoch": 0.7943009795191451, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8229323625564575, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.845450758934021, + "num_tokens": 238194492.0, + "step": 6244 + }, + { + "epoch": 0.7944281897977357, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.8870309591293335, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8611398935317993, + "num_tokens": 238230451.0, + "step": 6245 + }, + { + "epoch": 0.7945554000763262, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.017237901687622, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8689916133880615, + "num_tokens": 238266331.0, + "step": 6246 + }, + { + "epoch": 0.7946826103549167, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8559391498565674, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8509184718132019, + "num_tokens": 238307753.0, + "step": 6247 + }, + { + "epoch": 0.7948098206335071, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.7954834699630737, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8616183996200562, + "num_tokens": 238349644.0, + "step": 6248 + }, + { + "epoch": 0.7949370309120977, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.9641213417053223, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8481043577194214, + "num_tokens": 238386308.0, + "step": 6249 + }, + { + "epoch": 0.7950642411906882, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.8952007293701172, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8470268249511719, + "num_tokens": 238427935.0, + "step": 6250 + }, + { + "epoch": 0.7951914514692787, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.9024498462677002, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8675432205200195, + "num_tokens": 238463639.0, + "step": 6251 + }, + { + "epoch": 0.7953186617478692, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.8683570623397827, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8753451704978943, + "num_tokens": 238498295.0, + "step": 6252 + }, + { + "epoch": 0.7954458720264598, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7797526121139526, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8617860078811646, + "num_tokens": 238538109.0, + "step": 6253 + }, + { + "epoch": 0.7955730823050502, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.0319552421569824, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8476015329360962, + "num_tokens": 238574415.0, + "step": 6254 + }, + { + "epoch": 0.7957002925836407, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8568487167358398, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8639203310012817, + "num_tokens": 238616368.0, + "step": 6255 + }, + { + "epoch": 0.7958275028622313, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.919115662574768, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8655880689620972, + "num_tokens": 238650634.0, + "step": 6256 + }, + { + "epoch": 0.7959547131408218, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9250503778457642, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8561806678771973, + "num_tokens": 238684383.0, + "step": 6257 + }, + { + "epoch": 0.7960819234194123, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.7890207767486572, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8581686019897461, + "num_tokens": 238722863.0, + "step": 6258 + }, + { + "epoch": 0.7962091336980028, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7298206090927124, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8626182079315186, + "num_tokens": 238765027.0, + "step": 6259 + }, + { + "epoch": 0.7963363439765934, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.146644115447998, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8667160272598267, + "num_tokens": 238801950.0, + "step": 6260 + }, + { + "epoch": 0.7964635542551838, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8219588994979858, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.853442907333374, + "num_tokens": 238840963.0, + "step": 6261 + }, + { + "epoch": 0.7965907645337743, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9081135988235474, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8492329120635986, + "num_tokens": 238876167.0, + "step": 6262 + }, + { + "epoch": 0.7967179748123648, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8298205137252808, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8669875264167786, + "num_tokens": 238910293.0, + "step": 6263 + }, + { + "epoch": 0.7968451850909554, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9121195077896118, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8595408201217651, + "num_tokens": 238947279.0, + "step": 6264 + }, + { + "epoch": 0.7969723953695459, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.006321907043457, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.850893497467041, + "num_tokens": 238984148.0, + "step": 6265 + }, + { + "epoch": 0.7970996056481364, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.755511999130249, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8485658764839172, + "num_tokens": 239022370.0, + "step": 6266 + }, + { + "epoch": 0.7972268159267268, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.759606957435608, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8582527041435242, + "num_tokens": 239064542.0, + "step": 6267 + }, + { + "epoch": 0.7973540262053174, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.871216893196106, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8595985174179077, + "num_tokens": 239103193.0, + "step": 6268 + }, + { + "epoch": 0.7974812364839079, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8733588457107544, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8695206046104431, + "num_tokens": 239137952.0, + "step": 6269 + }, + { + "epoch": 0.7976084467624984, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.842607021331787, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8518476486206055, + "num_tokens": 239174600.0, + "step": 6270 + }, + { + "epoch": 0.797735657041089, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.976388931274414, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8653915524482727, + "num_tokens": 239206157.0, + "step": 6271 + }, + { + "epoch": 0.7978628673196795, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.82457435131073, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8667182922363281, + "num_tokens": 239243094.0, + "step": 6272 + }, + { + "epoch": 0.7979900775982699, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7774872779846191, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8616651296615601, + "num_tokens": 239281208.0, + "step": 6273 + }, + { + "epoch": 0.7981172878768604, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8479000329971313, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.857785701751709, + "num_tokens": 239320159.0, + "step": 6274 + }, + { + "epoch": 0.798244498155451, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8218177556991577, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8485978841781616, + "num_tokens": 239360160.0, + "step": 6275 + }, + { + "epoch": 0.7983717084340415, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.2934916019439697, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8633980751037598, + "num_tokens": 239398101.0, + "step": 6276 + }, + { + "epoch": 0.798498918712632, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7746638059616089, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8762546181678772, + "num_tokens": 239438560.0, + "step": 6277 + }, + { + "epoch": 0.7986261289912225, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.803960919380188, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8568251132965088, + "num_tokens": 239475851.0, + "step": 6278 + }, + { + "epoch": 0.798753339269813, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7617546319961548, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8660086393356323, + "num_tokens": 239519165.0, + "step": 6279 + }, + { + "epoch": 0.7988805495484035, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.114351987838745, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8486164808273315, + "num_tokens": 239560602.0, + "step": 6280 + }, + { + "epoch": 0.799007759826994, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.028353452682495, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8621448278427124, + "num_tokens": 239591933.0, + "step": 6281 + }, + { + "epoch": 0.7991349701055845, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9710984230041504, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8566073179244995, + "num_tokens": 239625495.0, + "step": 6282 + }, + { + "epoch": 0.7992621803841751, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9468986988067627, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8779778480529785, + "num_tokens": 239658249.0, + "step": 6283 + }, + { + "epoch": 0.7993893906627656, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.1367299556732178, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8522067070007324, + "num_tokens": 239690194.0, + "step": 6284 + }, + { + "epoch": 0.799516600941356, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.1801726818084717, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8603173494338989, + "num_tokens": 239728315.0, + "step": 6285 + }, + { + "epoch": 0.7996438112199465, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9497610330581665, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8646686673164368, + "num_tokens": 239764505.0, + "step": 6286 + }, + { + "epoch": 0.7997710214985371, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7262330055236816, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8454362154006958, + "num_tokens": 239809461.0, + "step": 6287 + }, + { + "epoch": 0.7998982317771276, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.0801877975463867, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8556320071220398, + "num_tokens": 239845232.0, + "step": 6288 + }, + { + "epoch": 0.8000254420557181, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7533817291259766, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8672005534172058, + "num_tokens": 239885489.0, + "step": 6289 + }, + { + "epoch": 0.8001526523343087, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9936243295669556, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8474504947662354, + "num_tokens": 239921159.0, + "step": 6290 + }, + { + "epoch": 0.8002798626128991, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7920256853103638, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8659567832946777, + "num_tokens": 239960879.0, + "step": 6291 + }, + { + "epoch": 0.8004070728914896, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.059645652770996, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8510370254516602, + "num_tokens": 239994302.0, + "step": 6292 + }, + { + "epoch": 0.8005342831700801, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9222780466079712, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8678479790687561, + "num_tokens": 240027759.0, + "step": 6293 + }, + { + "epoch": 0.8006614934486707, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8917150497436523, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.855434000492096, + "num_tokens": 240065049.0, + "step": 6294 + }, + { + "epoch": 0.8007887037272612, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.0752289295196533, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.846156120300293, + "num_tokens": 240101453.0, + "step": 6295 + }, + { + "epoch": 0.8009159140058517, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.114471197128296, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8395147919654846, + "num_tokens": 240136831.0, + "step": 6296 + }, + { + "epoch": 0.8010431242844421, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.4110777378082275, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.859606146812439, + "num_tokens": 240173364.0, + "step": 6297 + }, + { + "epoch": 0.8011703345630327, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.226046085357666, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.839715301990509, + "num_tokens": 240206718.0, + "step": 6298 + }, + { + "epoch": 0.8012975448416232, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8520365953445435, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8497059941291809, + "num_tokens": 240245633.0, + "step": 6299 + }, + { + "epoch": 0.8014247551202137, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8072974681854248, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8741002082824707, + "num_tokens": 240283887.0, + "step": 6300 + }, + { + "epoch": 0.8015519653988042, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8076813220977783, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8520993590354919, + "num_tokens": 240322208.0, + "step": 6301 + }, + { + "epoch": 0.8016791756773948, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8958337306976318, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8639144897460938, + "num_tokens": 240356154.0, + "step": 6302 + }, + { + "epoch": 0.8018063859559852, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8512569665908813, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8599673509597778, + "num_tokens": 240389103.0, + "step": 6303 + }, + { + "epoch": 0.8019335962345757, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8174189329147339, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8780831098556519, + "num_tokens": 240429545.0, + "step": 6304 + }, + { + "epoch": 0.8020608065131662, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.74359130859375, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8778742551803589, + "num_tokens": 240468556.0, + "step": 6305 + }, + { + "epoch": 0.8021880167917568, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7882165908813477, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8631095886230469, + "num_tokens": 240508534.0, + "step": 6306 + }, + { + "epoch": 0.8023152270703473, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.0506348609924316, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8607197999954224, + "num_tokens": 240540833.0, + "step": 6307 + }, + { + "epoch": 0.8024424373489378, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.1655030250549316, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8524186015129089, + "num_tokens": 240577768.0, + "step": 6308 + }, + { + "epoch": 0.8025696476275284, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.027484893798828, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8568898439407349, + "num_tokens": 240610607.0, + "step": 6309 + }, + { + "epoch": 0.8026968579061188, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9469608068466187, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8573595285415649, + "num_tokens": 240646056.0, + "step": 6310 + }, + { + "epoch": 0.8028240681847093, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8488941192626953, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.86830073595047, + "num_tokens": 240682513.0, + "step": 6311 + }, + { + "epoch": 0.8029512784632998, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0146396160125732, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8605481386184692, + "num_tokens": 240716311.0, + "step": 6312 + }, + { + "epoch": 0.8030784887418904, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.011021137237549, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8508365750312805, + "num_tokens": 240758873.0, + "step": 6313 + }, + { + "epoch": 0.8032056990204809, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8756054639816284, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8660991787910461, + "num_tokens": 240797163.0, + "step": 6314 + }, + { + "epoch": 0.8033329092990714, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7442110776901245, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8718007802963257, + "num_tokens": 240835719.0, + "step": 6315 + }, + { + "epoch": 0.8034601195776618, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8781424760818481, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8536601066589355, + "num_tokens": 240874359.0, + "step": 6316 + }, + { + "epoch": 0.8035873298562524, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8433383703231812, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8733169436454773, + "num_tokens": 240909125.0, + "step": 6317 + }, + { + "epoch": 0.8037145401348429, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8654565811157227, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8643457889556885, + "num_tokens": 240949391.0, + "step": 6318 + }, + { + "epoch": 0.8038417504134334, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9714999198913574, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8513540029525757, + "num_tokens": 240987390.0, + "step": 6319 + }, + { + "epoch": 0.803968960692024, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8993618488311768, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8539320230484009, + "num_tokens": 241025359.0, + "step": 6320 + }, + { + "epoch": 0.8040961709706145, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.071420431137085, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8552361726760864, + "num_tokens": 241060532.0, + "step": 6321 + }, + { + "epoch": 0.8042233812492049, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7912061214447021, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8661689758300781, + "num_tokens": 241098547.0, + "step": 6322 + }, + { + "epoch": 0.8043505915277954, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9894306659698486, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8604953289031982, + "num_tokens": 241131091.0, + "step": 6323 + }, + { + "epoch": 0.804477801806386, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.0144846439361572, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8495621681213379, + "num_tokens": 241164495.0, + "step": 6324 + }, + { + "epoch": 0.8046050120849765, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8762491941452026, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8549220561981201, + "num_tokens": 241203777.0, + "step": 6325 + }, + { + "epoch": 0.804732222363567, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8684163093566895, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8542938232421875, + "num_tokens": 241243219.0, + "step": 6326 + }, + { + "epoch": 0.8048594326421575, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7935887575149536, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8449931144714355, + "num_tokens": 241284547.0, + "step": 6327 + }, + { + "epoch": 0.804986642920748, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8924176692962646, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8437617421150208, + "num_tokens": 241331110.0, + "step": 6328 + }, + { + "epoch": 0.8051138531993385, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7148159742355347, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8605204820632935, + "num_tokens": 241372720.0, + "step": 6329 + }, + { + "epoch": 0.805241063477929, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8179794549942017, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8626940250396729, + "num_tokens": 241408173.0, + "step": 6330 + }, + { + "epoch": 0.8053682737565195, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8228046894073486, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8670111894607544, + "num_tokens": 241445021.0, + "step": 6331 + }, + { + "epoch": 0.8054954840351101, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8482797145843506, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8649018406867981, + "num_tokens": 241481874.0, + "step": 6332 + }, + { + "epoch": 0.8056226943137006, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.86030113697052, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8450279235839844, + "num_tokens": 241523140.0, + "step": 6333 + }, + { + "epoch": 0.805749904592291, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8947064876556396, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8420794010162354, + "num_tokens": 241563424.0, + "step": 6334 + }, + { + "epoch": 0.8058771148708815, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9398561716079712, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8724387288093567, + "num_tokens": 241600004.0, + "step": 6335 + }, + { + "epoch": 0.8060043251494721, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8605037927627563, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8585678339004517, + "num_tokens": 241642704.0, + "step": 6336 + }, + { + "epoch": 0.8061315354280626, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.886362075805664, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8641088008880615, + "num_tokens": 241679176.0, + "step": 6337 + }, + { + "epoch": 0.8062587457066531, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7377253770828247, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.865075945854187, + "num_tokens": 241721168.0, + "step": 6338 + }, + { + "epoch": 0.8063859559852437, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7610090970993042, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8742946982383728, + "num_tokens": 241759735.0, + "step": 6339 + }, + { + "epoch": 0.8065131662638341, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8608123064041138, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.861255407333374, + "num_tokens": 241799445.0, + "step": 6340 + }, + { + "epoch": 0.8066403765424246, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9635591506958008, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8653216361999512, + "num_tokens": 241839333.0, + "step": 6341 + }, + { + "epoch": 0.8067675868210151, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.898840069770813, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8479808568954468, + "num_tokens": 241880248.0, + "step": 6342 + }, + { + "epoch": 0.8068947970996057, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7367899417877197, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8733965158462524, + "num_tokens": 241926717.0, + "step": 6343 + }, + { + "epoch": 0.8070220073781962, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.874849557876587, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.852238655090332, + "num_tokens": 241966080.0, + "step": 6344 + }, + { + "epoch": 0.8071492176567867, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8366413116455078, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8573337197303772, + "num_tokens": 242011975.0, + "step": 6345 + }, + { + "epoch": 0.8072764279353771, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9212015867233276, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8734436631202698, + "num_tokens": 242046320.0, + "step": 6346 + }, + { + "epoch": 0.8074036382139677, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8940629959106445, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8613578081130981, + "num_tokens": 242085157.0, + "step": 6347 + }, + { + "epoch": 0.8075308484925582, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.1025869846343994, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.857320249080658, + "num_tokens": 242116548.0, + "step": 6348 + }, + { + "epoch": 0.8076580587711487, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.060748338699341, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.866174042224884, + "num_tokens": 242151883.0, + "step": 6349 + }, + { + "epoch": 0.8077852690497392, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9820010662078857, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8577025532722473, + "num_tokens": 242190486.0, + "step": 6350 + }, + { + "epoch": 0.8079124793283298, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9119206666946411, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8647880554199219, + "num_tokens": 242232720.0, + "step": 6351 + }, + { + "epoch": 0.8080396896069202, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.021416425704956, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8615765571594238, + "num_tokens": 242273926.0, + "step": 6352 + }, + { + "epoch": 0.8081668998855107, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8430434465408325, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8514399528503418, + "num_tokens": 242313484.0, + "step": 6353 + }, + { + "epoch": 0.8082941101641012, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.950918197631836, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.85554438829422, + "num_tokens": 242352101.0, + "step": 6354 + }, + { + "epoch": 0.8084213204426918, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7381092309951782, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8756431341171265, + "num_tokens": 242392958.0, + "step": 6355 + }, + { + "epoch": 0.8085485307212823, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.853066325187683, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8621933460235596, + "num_tokens": 242432327.0, + "step": 6356 + }, + { + "epoch": 0.8086757409998728, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8883843421936035, + "learning_rate": 1e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8395315408706665, + "num_tokens": 242473714.0, + "step": 6357 + }, + { + "epoch": 0.8088029512784632, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9709985256195068, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8679653406143188, + "num_tokens": 242513179.0, + "step": 6358 + }, + { + "epoch": 0.8089301615570538, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8856308460235596, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8600549697875977, + "num_tokens": 242549398.0, + "step": 6359 + }, + { + "epoch": 0.8090573718356443, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9346295595169067, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8560057878494263, + "num_tokens": 242583705.0, + "step": 6360 + }, + { + "epoch": 0.8091845821142348, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8068304061889648, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8665041923522949, + "num_tokens": 242625900.0, + "step": 6361 + }, + { + "epoch": 0.8093117923928254, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.743901252746582, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8501062393188477, + "num_tokens": 242665766.0, + "step": 6362 + }, + { + "epoch": 0.8094390026714159, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.882810115814209, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8634775876998901, + "num_tokens": 242707634.0, + "step": 6363 + }, + { + "epoch": 0.8095662129500064, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.957896113395691, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8566433787345886, + "num_tokens": 242746791.0, + "step": 6364 + }, + { + "epoch": 0.8096934232285968, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7577110528945923, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8737993240356445, + "num_tokens": 242784861.0, + "step": 6365 + }, + { + "epoch": 0.8098206335071874, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.849130630493164, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8746289014816284, + "num_tokens": 242817612.0, + "step": 6366 + }, + { + "epoch": 0.8099478437857779, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7548686265945435, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8671895265579224, + "num_tokens": 242854951.0, + "step": 6367 + }, + { + "epoch": 0.8100750540643684, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9513376951217651, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8762116432189941, + "num_tokens": 242894166.0, + "step": 6368 + }, + { + "epoch": 0.8102022643429589, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8320438861846924, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8666410446166992, + "num_tokens": 242936230.0, + "step": 6369 + }, + { + "epoch": 0.8103294746215495, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 7.764511585235596, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8716103434562683, + "num_tokens": 242973148.0, + "step": 6370 + }, + { + "epoch": 0.8104566849001399, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0632495880126953, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8624871969223022, + "num_tokens": 243013029.0, + "step": 6371 + }, + { + "epoch": 0.8105838951787304, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9138466119766235, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8607462048530579, + "num_tokens": 243054847.0, + "step": 6372 + }, + { + "epoch": 0.810711105457321, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8330459594726562, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8540595769882202, + "num_tokens": 243096524.0, + "step": 6373 + }, + { + "epoch": 0.8108383157359115, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.004063367843628, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.852687656879425, + "num_tokens": 243132397.0, + "step": 6374 + }, + { + "epoch": 0.810965526014502, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.753032922744751, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8477882742881775, + "num_tokens": 243171790.0, + "step": 6375 + }, + { + "epoch": 0.8110927362930925, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9051899909973145, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.859600841999054, + "num_tokens": 243208217.0, + "step": 6376 + }, + { + "epoch": 0.811219946571683, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.0067737102508545, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8656713962554932, + "num_tokens": 243245260.0, + "step": 6377 + }, + { + "epoch": 0.8113471568502735, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.837193489074707, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8683562278747559, + "num_tokens": 243283924.0, + "step": 6378 + }, + { + "epoch": 0.811474367128864, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7972384691238403, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8697443008422852, + "num_tokens": 243321977.0, + "step": 6379 + }, + { + "epoch": 0.8116015774074545, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.8896435499191284, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8577189445495605, + "num_tokens": 243359349.0, + "step": 6380 + }, + { + "epoch": 0.8117287876860451, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8545361757278442, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8644052743911743, + "num_tokens": 243395722.0, + "step": 6381 + }, + { + "epoch": 0.8118559979646356, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.092313051223755, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8495891094207764, + "num_tokens": 243429022.0, + "step": 6382 + }, + { + "epoch": 0.811983208243226, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9495714902877808, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8399362564086914, + "num_tokens": 243468474.0, + "step": 6383 + }, + { + "epoch": 0.8121104185218165, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 7.781181812286377, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8762117624282837, + "num_tokens": 243503045.0, + "step": 6384 + }, + { + "epoch": 0.8122376288004071, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0123581886291504, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8499805927276611, + "num_tokens": 243543865.0, + "step": 6385 + }, + { + "epoch": 0.8123648390789976, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9343161582946777, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8534929752349854, + "num_tokens": 243581258.0, + "step": 6386 + }, + { + "epoch": 0.8124920493575881, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7885621786117554, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8545342087745667, + "num_tokens": 243622777.0, + "step": 6387 + }, + { + "epoch": 0.8126192596361786, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9269628524780273, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.849025309085846, + "num_tokens": 243664549.0, + "step": 6388 + }, + { + "epoch": 0.8127464699147691, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7504074573516846, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8553693294525146, + "num_tokens": 243710408.0, + "step": 6389 + }, + { + "epoch": 0.8128736801933596, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7210544347763062, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8623684644699097, + "num_tokens": 243752598.0, + "step": 6390 + }, + { + "epoch": 0.8130008904719501, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8493608236312866, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.858567476272583, + "num_tokens": 243789805.0, + "step": 6391 + }, + { + "epoch": 0.8131281007505406, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.0493366718292236, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8698503375053406, + "num_tokens": 243824124.0, + "step": 6392 + }, + { + "epoch": 0.8132553110291312, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9211558103561401, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8593501448631287, + "num_tokens": 243863448.0, + "step": 6393 + }, + { + "epoch": 0.8133825213077217, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7882941961288452, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8639152646064758, + "num_tokens": 243902085.0, + "step": 6394 + }, + { + "epoch": 0.8135097315863121, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 16.597158432006836, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8547921776771545, + "num_tokens": 243945349.0, + "step": 6395 + }, + { + "epoch": 0.8136369418649027, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.019993782043457, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8571734428405762, + "num_tokens": 243982221.0, + "step": 6396 + }, + { + "epoch": 0.8137641521434932, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.109412670135498, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8478273749351501, + "num_tokens": 244012576.0, + "step": 6397 + }, + { + "epoch": 0.8138913624220837, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.972099781036377, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8673408627510071, + "num_tokens": 244052089.0, + "step": 6398 + }, + { + "epoch": 0.8140185727006742, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9510471820831299, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8624461889266968, + "num_tokens": 244088712.0, + "step": 6399 + }, + { + "epoch": 0.8141457829792648, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7619467973709106, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8618955612182617, + "num_tokens": 244127655.0, + "step": 6400 + }, + { + "epoch": 0.8142729932578552, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9252108335494995, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8655421137809753, + "num_tokens": 244158588.0, + "step": 6401 + }, + { + "epoch": 0.8144002035364457, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8358087539672852, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8791605234146118, + "num_tokens": 244193872.0, + "step": 6402 + }, + { + "epoch": 0.8145274138150362, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.892903208732605, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8449593186378479, + "num_tokens": 244234055.0, + "step": 6403 + }, + { + "epoch": 0.8146546240936268, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.810305118560791, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8606215715408325, + "num_tokens": 244275163.0, + "step": 6404 + }, + { + "epoch": 0.8147818343722173, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7890608310699463, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8578586578369141, + "num_tokens": 244315241.0, + "step": 6405 + }, + { + "epoch": 0.8149090446508078, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9823737144470215, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8758690357208252, + "num_tokens": 244351947.0, + "step": 6406 + }, + { + "epoch": 0.8150362549293982, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.011786699295044, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8586087226867676, + "num_tokens": 244387706.0, + "step": 6407 + }, + { + "epoch": 0.8151634652079888, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.79585599899292, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8680474162101746, + "num_tokens": 244428447.0, + "step": 6408 + }, + { + "epoch": 0.8152906754865793, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8812576532363892, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8460944294929504, + "num_tokens": 244471191.0, + "step": 6409 + }, + { + "epoch": 0.8154178857651698, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.958396077156067, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8423839807510376, + "num_tokens": 244509198.0, + "step": 6410 + }, + { + "epoch": 0.8155450960437604, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9435096979141235, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8511053323745728, + "num_tokens": 244549677.0, + "step": 6411 + }, + { + "epoch": 0.8156723063223509, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.030297040939331, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8477573394775391, + "num_tokens": 244583586.0, + "step": 6412 + }, + { + "epoch": 0.8157995166009414, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9234930276870728, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8689620494842529, + "num_tokens": 244621220.0, + "step": 6413 + }, + { + "epoch": 0.8159267268795318, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9589502811431885, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8682301044464111, + "num_tokens": 244655941.0, + "step": 6414 + }, + { + "epoch": 0.8160539371581224, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9264720678329468, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8774970769882202, + "num_tokens": 244685807.0, + "step": 6415 + }, + { + "epoch": 0.8161811474367129, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.771787405014038, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8592792749404907, + "num_tokens": 244726866.0, + "step": 6416 + }, + { + "epoch": 0.8163083577153034, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8302801847457886, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8514251708984375, + "num_tokens": 244767273.0, + "step": 6417 + }, + { + "epoch": 0.8164355679938939, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8301688432693481, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.857886552810669, + "num_tokens": 244806111.0, + "step": 6418 + }, + { + "epoch": 0.8165627782724845, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9074770212173462, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8709493279457092, + "num_tokens": 244841706.0, + "step": 6419 + }, + { + "epoch": 0.8166899885510749, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.0641937255859375, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8628348112106323, + "num_tokens": 244883908.0, + "step": 6420 + }, + { + "epoch": 0.8168171988296654, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.870771884918213, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8521780967712402, + "num_tokens": 244919090.0, + "step": 6421 + }, + { + "epoch": 0.8169444091082559, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.932418704032898, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8754779696464539, + "num_tokens": 244961141.0, + "step": 6422 + }, + { + "epoch": 0.8170716193868465, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8417277336120605, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8453388214111328, + "num_tokens": 245004503.0, + "step": 6423 + }, + { + "epoch": 0.817198829665437, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8621876239776611, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8582808971405029, + "num_tokens": 245045042.0, + "step": 6424 + }, + { + "epoch": 0.8173260399440275, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9122695922851562, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8588244915008545, + "num_tokens": 245085416.0, + "step": 6425 + }, + { + "epoch": 0.8174532502226179, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0351059436798096, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8494468927383423, + "num_tokens": 245116137.0, + "step": 6426 + }, + { + "epoch": 0.8175804605012085, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7535715103149414, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8718914985656738, + "num_tokens": 245159181.0, + "step": 6427 + }, + { + "epoch": 0.817707670779799, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.730615496635437, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.853588342666626, + "num_tokens": 245205199.0, + "step": 6428 + }, + { + "epoch": 0.8178348810583895, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.98381507396698, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8651081323623657, + "num_tokens": 245242455.0, + "step": 6429 + }, + { + "epoch": 0.8179620913369801, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7644132375717163, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8814162015914917, + "num_tokens": 245280675.0, + "step": 6430 + }, + { + "epoch": 0.8180893016155706, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8879146575927734, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8758382797241211, + "num_tokens": 245316947.0, + "step": 6431 + }, + { + "epoch": 0.818216511894161, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9085899591445923, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8638216257095337, + "num_tokens": 245355145.0, + "step": 6432 + }, + { + "epoch": 0.8183437221727515, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8279906511306763, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.873116135597229, + "num_tokens": 245390282.0, + "step": 6433 + }, + { + "epoch": 0.8184709324513421, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.4521701335906982, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8700004816055298, + "num_tokens": 245430262.0, + "step": 6434 + }, + { + "epoch": 0.8185981427299326, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7855991125106812, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8584136366844177, + "num_tokens": 245470874.0, + "step": 6435 + }, + { + "epoch": 0.8187253530085231, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9108926057815552, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8667447566986084, + "num_tokens": 245506626.0, + "step": 6436 + }, + { + "epoch": 0.8188525632871136, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8882218599319458, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.844334065914154, + "num_tokens": 245546794.0, + "step": 6437 + }, + { + "epoch": 0.8189797735657041, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7453277111053467, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.863768458366394, + "num_tokens": 245589544.0, + "step": 6438 + }, + { + "epoch": 0.8191069838442946, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.048893451690674, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8761804699897766, + "num_tokens": 245628889.0, + "step": 6439 + }, + { + "epoch": 0.8192341941228851, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7894728183746338, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8573369979858398, + "num_tokens": 245674615.0, + "step": 6440 + }, + { + "epoch": 0.8193614044014756, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8378232717514038, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.859295666217804, + "num_tokens": 245713966.0, + "step": 6441 + }, + { + "epoch": 0.8194886146800662, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.4591755867004395, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8648942112922668, + "num_tokens": 245750035.0, + "step": 6442 + }, + { + "epoch": 0.8196158249586567, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7738879919052124, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.875792920589447, + "num_tokens": 245788506.0, + "step": 6443 + }, + { + "epoch": 0.8197430352372471, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.872823715209961, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8637537360191345, + "num_tokens": 245826547.0, + "step": 6444 + }, + { + "epoch": 0.8198702455158376, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7396399974822998, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8513572812080383, + "num_tokens": 245874363.0, + "step": 6445 + }, + { + "epoch": 0.8199974557944282, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.0374159812927246, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8428002595901489, + "num_tokens": 245917813.0, + "step": 6446 + }, + { + "epoch": 0.8201246660730187, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9485366344451904, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8569684624671936, + "num_tokens": 245952177.0, + "step": 6447 + }, + { + "epoch": 0.8202518763516092, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8817747831344604, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8636336326599121, + "num_tokens": 245992897.0, + "step": 6448 + }, + { + "epoch": 0.8203790866301998, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.845645546913147, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8688929080963135, + "num_tokens": 246029657.0, + "step": 6449 + }, + { + "epoch": 0.8205062969087902, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.832034945487976, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.851974606513977, + "num_tokens": 246072425.0, + "step": 6450 + }, + { + "epoch": 0.8206335071873807, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.6773535013198853, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8777233958244324, + "num_tokens": 246113131.0, + "step": 6451 + }, + { + "epoch": 0.8207607174659712, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8551069498062134, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8640018701553345, + "num_tokens": 246154548.0, + "step": 6452 + }, + { + "epoch": 0.8208879277445618, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9059830904006958, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8585052490234375, + "num_tokens": 246189518.0, + "step": 6453 + }, + { + "epoch": 0.8210151380231523, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7722833156585693, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8562831878662109, + "num_tokens": 246235248.0, + "step": 6454 + }, + { + "epoch": 0.8211423483017428, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.6978884935379028, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8533234596252441, + "num_tokens": 246277016.0, + "step": 6455 + }, + { + "epoch": 0.8212695585803332, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8915231227874756, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.857607364654541, + "num_tokens": 246312352.0, + "step": 6456 + }, + { + "epoch": 0.8213967688589238, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7667478322982788, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8465300798416138, + "num_tokens": 246355079.0, + "step": 6457 + }, + { + "epoch": 0.8215239791375143, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.1237709522247314, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8481886386871338, + "num_tokens": 246391073.0, + "step": 6458 + }, + { + "epoch": 0.8216511894161048, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7734051942825317, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.866902232170105, + "num_tokens": 246435407.0, + "step": 6459 + }, + { + "epoch": 0.8217783996946953, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.953349232673645, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8473653793334961, + "num_tokens": 246475303.0, + "step": 6460 + }, + { + "epoch": 0.8219056099732859, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8549175262451172, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8594812154769897, + "num_tokens": 246515237.0, + "step": 6461 + }, + { + "epoch": 0.8220328202518764, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.839312195777893, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8554359674453735, + "num_tokens": 246554958.0, + "step": 6462 + }, + { + "epoch": 0.8221600305304668, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9919793605804443, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8596604466438293, + "num_tokens": 246592327.0, + "step": 6463 + }, + { + "epoch": 0.8222872408090574, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.818790078163147, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8656878471374512, + "num_tokens": 246626965.0, + "step": 6464 + }, + { + "epoch": 0.8224144510876479, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9017658233642578, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8576293587684631, + "num_tokens": 246662049.0, + "step": 6465 + }, + { + "epoch": 0.8225416613662384, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.831529974937439, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8628076314926147, + "num_tokens": 246696989.0, + "step": 6466 + }, + { + "epoch": 0.8226688716448289, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.752013087272644, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.864757776260376, + "num_tokens": 246738418.0, + "step": 6467 + }, + { + "epoch": 0.8227960819234195, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8073794841766357, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8614089488983154, + "num_tokens": 246776779.0, + "step": 6468 + }, + { + "epoch": 0.8229232922020099, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.829504370689392, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8661748170852661, + "num_tokens": 246814029.0, + "step": 6469 + }, + { + "epoch": 0.8230505024806004, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8394941091537476, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8521013259887695, + "num_tokens": 246856390.0, + "step": 6470 + }, + { + "epoch": 0.8231777127591909, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.999555230140686, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8619090914726257, + "num_tokens": 246895563.0, + "step": 6471 + }, + { + "epoch": 0.8233049230377815, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7992844581604004, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8577272891998291, + "num_tokens": 246935956.0, + "step": 6472 + }, + { + "epoch": 0.823432133316372, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.6747630834579468, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8572452664375305, + "num_tokens": 246984765.0, + "step": 6473 + }, + { + "epoch": 0.8235593435949625, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9197133779525757, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8641054630279541, + "num_tokens": 247017086.0, + "step": 6474 + }, + { + "epoch": 0.8236865538735529, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8866599798202515, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8656095862388611, + "num_tokens": 247056195.0, + "step": 6475 + }, + { + "epoch": 0.8238137641521435, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9911553859710693, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8741053342819214, + "num_tokens": 247094063.0, + "step": 6476 + }, + { + "epoch": 0.823940974430734, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.929425835609436, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.859498918056488, + "num_tokens": 247131743.0, + "step": 6477 + }, + { + "epoch": 0.8240681847093245, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9243459701538086, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8583047389984131, + "num_tokens": 247169477.0, + "step": 6478 + }, + { + "epoch": 0.824195394987915, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.0071537494659424, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8699218034744263, + "num_tokens": 247209159.0, + "step": 6479 + }, + { + "epoch": 0.8243226052665056, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8248355388641357, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8629447221755981, + "num_tokens": 247247603.0, + "step": 6480 + }, + { + "epoch": 0.824449815545096, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7328650951385498, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8706405162811279, + "num_tokens": 247290184.0, + "step": 6481 + }, + { + "epoch": 0.8245770258236865, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.896456241607666, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8428473472595215, + "num_tokens": 247331862.0, + "step": 6482 + }, + { + "epoch": 0.824704236102277, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9201918840408325, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.860058069229126, + "num_tokens": 247368271.0, + "step": 6483 + }, + { + "epoch": 0.8248314463808676, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.968659520149231, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8593283891677856, + "num_tokens": 247408077.0, + "step": 6484 + }, + { + "epoch": 0.8249586566594581, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8606375455856323, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8676748871803284, + "num_tokens": 247447408.0, + "step": 6485 + }, + { + "epoch": 0.8250858669380486, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.917551875114441, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8711032867431641, + "num_tokens": 247489863.0, + "step": 6486 + }, + { + "epoch": 0.8252130772166391, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.05014967918396, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8561290502548218, + "num_tokens": 247532233.0, + "step": 6487 + }, + { + "epoch": 0.8253402874952296, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8634806871414185, + "learning_rate": 1e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.8368936777114868, + "num_tokens": 247570719.0, + "step": 6488 + }, + { + "epoch": 0.8254674977738201, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9650744199752808, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8658532500267029, + "num_tokens": 247603302.0, + "step": 6489 + }, + { + "epoch": 0.8255947080524106, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7941572666168213, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8634784817695618, + "num_tokens": 247642837.0, + "step": 6490 + }, + { + "epoch": 0.8257219183310012, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7590214014053345, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8586305379867554, + "num_tokens": 247685246.0, + "step": 6491 + }, + { + "epoch": 0.8258491286095917, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.0138421058654785, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.85188889503479, + "num_tokens": 247720871.0, + "step": 6492 + }, + { + "epoch": 0.8259763388881821, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.941132664680481, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8619198203086853, + "num_tokens": 247759796.0, + "step": 6493 + }, + { + "epoch": 0.8261035491667726, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9418004751205444, + "learning_rate": 1e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.840142548084259, + "num_tokens": 247806486.0, + "step": 6494 + }, + { + "epoch": 0.8262307594453632, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.949100375175476, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8683772683143616, + "num_tokens": 247842031.0, + "step": 6495 + }, + { + "epoch": 0.8263579697239537, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8156522512435913, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8646516799926758, + "num_tokens": 247883438.0, + "step": 6496 + }, + { + "epoch": 0.8264851800025442, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 7.738022804260254, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.86826491355896, + "num_tokens": 247921491.0, + "step": 6497 + }, + { + "epoch": 0.8266123902811348, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9301925897598267, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8712338209152222, + "num_tokens": 247957454.0, + "step": 6498 + }, + { + "epoch": 0.8267396005597252, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.875192642211914, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8457177877426147, + "num_tokens": 247997705.0, + "step": 6499 + }, + { + "epoch": 0.8268668108383157, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8605680465698242, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8556432127952576, + "num_tokens": 248034536.0, + "step": 6500 + }, + { + "epoch": 0.8269940211169062, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.107448101043701, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8611540794372559, + "num_tokens": 248080916.0, + "step": 6501 + }, + { + "epoch": 0.8271212313954968, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8987444639205933, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8604544401168823, + "num_tokens": 248119719.0, + "step": 6502 + }, + { + "epoch": 0.8272484416740873, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9694501161575317, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8582668304443359, + "num_tokens": 248156221.0, + "step": 6503 + }, + { + "epoch": 0.8273756519526778, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 80.52055358886719, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8634070158004761, + "num_tokens": 248191780.0, + "step": 6504 + }, + { + "epoch": 0.8275028622312682, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.046034812927246, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.856255054473877, + "num_tokens": 248228594.0, + "step": 6505 + }, + { + "epoch": 0.8276300725098588, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.090414524078369, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8666026592254639, + "num_tokens": 248260372.0, + "step": 6506 + }, + { + "epoch": 0.8277572827884493, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9250330924987793, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8526510000228882, + "num_tokens": 248299361.0, + "step": 6507 + }, + { + "epoch": 0.8278844930670398, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8700463771820068, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8529209494590759, + "num_tokens": 248337370.0, + "step": 6508 + }, + { + "epoch": 0.8280117033456303, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7119126319885254, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8630553483963013, + "num_tokens": 248377541.0, + "step": 6509 + }, + { + "epoch": 0.8281389136242209, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.981299877166748, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8430162668228149, + "num_tokens": 248413928.0, + "step": 6510 + }, + { + "epoch": 0.8282661239028114, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.939632773399353, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8605738878250122, + "num_tokens": 248453283.0, + "step": 6511 + }, + { + "epoch": 0.8283933341814018, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.832184910774231, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8660295009613037, + "num_tokens": 248490286.0, + "step": 6512 + }, + { + "epoch": 0.8285205444599923, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.916805386543274, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8459452986717224, + "num_tokens": 248526349.0, + "step": 6513 + }, + { + "epoch": 0.8286477547385829, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.849655270576477, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8547181487083435, + "num_tokens": 248561751.0, + "step": 6514 + }, + { + "epoch": 0.8287749650171734, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7176998853683472, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.873843789100647, + "num_tokens": 248601921.0, + "step": 6515 + }, + { + "epoch": 0.8289021752957639, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7213091850280762, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8720225691795349, + "num_tokens": 248641218.0, + "step": 6516 + }, + { + "epoch": 0.8290293855743545, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.0850605964660645, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8621374368667603, + "num_tokens": 248672616.0, + "step": 6517 + }, + { + "epoch": 0.8291565958529449, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8954904079437256, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8636771440505981, + "num_tokens": 248708212.0, + "step": 6518 + }, + { + "epoch": 0.8292838061315354, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.021141290664673, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8636854290962219, + "num_tokens": 248746575.0, + "step": 6519 + }, + { + "epoch": 0.8294110164101259, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8071074485778809, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8569944500923157, + "num_tokens": 248789423.0, + "step": 6520 + }, + { + "epoch": 0.8295382266887165, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9928009510040283, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8628064393997192, + "num_tokens": 248824044.0, + "step": 6521 + }, + { + "epoch": 0.829665436967307, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.882162094116211, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8592380285263062, + "num_tokens": 248860805.0, + "step": 6522 + }, + { + "epoch": 0.8297926472458975, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.0019171237945557, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8667458295822144, + "num_tokens": 248898958.0, + "step": 6523 + }, + { + "epoch": 0.8299198575244879, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.845206618309021, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8595395088195801, + "num_tokens": 248938847.0, + "step": 6524 + }, + { + "epoch": 0.8300470678030785, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.1163618564605713, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8575040102005005, + "num_tokens": 248977944.0, + "step": 6525 + }, + { + "epoch": 0.830174278081669, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9564121961593628, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8535022735595703, + "num_tokens": 249014610.0, + "step": 6526 + }, + { + "epoch": 0.8303014883602595, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.0539400577545166, + "learning_rate": 1e-06, + "loss": 0.5432, + "mean_token_accuracy": 0.8311722874641418, + "num_tokens": 249053869.0, + "step": 6527 + }, + { + "epoch": 0.83042869863885, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7977046966552734, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8533303737640381, + "num_tokens": 249096874.0, + "step": 6528 + }, + { + "epoch": 0.8305559089174406, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9224448204040527, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8587419986724854, + "num_tokens": 249133141.0, + "step": 6529 + }, + { + "epoch": 0.830683119196031, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.067883014678955, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8659428358078003, + "num_tokens": 249163813.0, + "step": 6530 + }, + { + "epoch": 0.8308103294746215, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9114634990692139, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8533801436424255, + "num_tokens": 249202935.0, + "step": 6531 + }, + { + "epoch": 0.830937539753212, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9094526767730713, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8679893016815186, + "num_tokens": 249241061.0, + "step": 6532 + }, + { + "epoch": 0.8310647500318026, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8948332071304321, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8483254909515381, + "num_tokens": 249277863.0, + "step": 6533 + }, + { + "epoch": 0.8311919603103931, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9605553150177002, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8596409559249878, + "num_tokens": 249318047.0, + "step": 6534 + }, + { + "epoch": 0.8313191705889836, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8340280055999756, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8550171852111816, + "num_tokens": 249362095.0, + "step": 6535 + }, + { + "epoch": 0.831446380867574, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8977655172348022, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8728944063186646, + "num_tokens": 249403362.0, + "step": 6536 + }, + { + "epoch": 0.8315735911461646, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7800793647766113, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8672096133232117, + "num_tokens": 249438051.0, + "step": 6537 + }, + { + "epoch": 0.8317008014247551, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8297035694122314, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8644509315490723, + "num_tokens": 249477185.0, + "step": 6538 + }, + { + "epoch": 0.8318280117033456, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0255799293518066, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8524834513664246, + "num_tokens": 249516880.0, + "step": 6539 + }, + { + "epoch": 0.8319552219819362, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7929177284240723, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8703058958053589, + "num_tokens": 249555864.0, + "step": 6540 + }, + { + "epoch": 0.8320824322605267, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9096823930740356, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.860461413860321, + "num_tokens": 249590566.0, + "step": 6541 + }, + { + "epoch": 0.8322096425391171, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 20.455982208251953, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8727298378944397, + "num_tokens": 249631523.0, + "step": 6542 + }, + { + "epoch": 0.8323368528177076, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 6.4183759689331055, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8695106506347656, + "num_tokens": 249674113.0, + "step": 6543 + }, + { + "epoch": 0.8324640630962982, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9683359861373901, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.871732771396637, + "num_tokens": 249712934.0, + "step": 6544 + }, + { + "epoch": 0.8325912733748887, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9242454767227173, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8655419945716858, + "num_tokens": 249750295.0, + "step": 6545 + }, + { + "epoch": 0.8327184836534792, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.974097490310669, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8653343915939331, + "num_tokens": 249790289.0, + "step": 6546 + }, + { + "epoch": 0.8328456939320698, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8957537412643433, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8736381530761719, + "num_tokens": 249822873.0, + "step": 6547 + }, + { + "epoch": 0.8329729042106602, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9579778909683228, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8621320128440857, + "num_tokens": 249860564.0, + "step": 6548 + }, + { + "epoch": 0.8331001144892507, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0113236904144287, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8597571849822998, + "num_tokens": 249901448.0, + "step": 6549 + }, + { + "epoch": 0.8332273247678412, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.2130770683288574, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8511642813682556, + "num_tokens": 249933524.0, + "step": 6550 + }, + { + "epoch": 0.8333545350464318, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9112889766693115, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8559281826019287, + "num_tokens": 249971856.0, + "step": 6551 + }, + { + "epoch": 0.8334817453250223, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7661104202270508, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8534833192825317, + "num_tokens": 250013940.0, + "step": 6552 + }, + { + "epoch": 0.8336089556036128, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.2538251876831055, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8423007726669312, + "num_tokens": 250044826.0, + "step": 6553 + }, + { + "epoch": 0.8337361658822032, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0132064819335938, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8449843525886536, + "num_tokens": 250087409.0, + "step": 6554 + }, + { + "epoch": 0.8338633761607938, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.749588966369629, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8686186075210571, + "num_tokens": 250126236.0, + "step": 6555 + }, + { + "epoch": 0.8339905864393843, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.810027837753296, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8688837289810181, + "num_tokens": 250163559.0, + "step": 6556 + }, + { + "epoch": 0.8341177967179748, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8734995126724243, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8611630201339722, + "num_tokens": 250203536.0, + "step": 6557 + }, + { + "epoch": 0.8342450069965653, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.946480631828308, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8671567440032959, + "num_tokens": 250237455.0, + "step": 6558 + }, + { + "epoch": 0.8343722172751559, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8051925897598267, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8690346479415894, + "num_tokens": 250274559.0, + "step": 6559 + }, + { + "epoch": 0.8344994275537464, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9553332328796387, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.86278235912323, + "num_tokens": 250309392.0, + "step": 6560 + }, + { + "epoch": 0.8346266378323368, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.757723093032837, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8656772375106812, + "num_tokens": 250352010.0, + "step": 6561 + }, + { + "epoch": 0.8347538481109273, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.3977596759796143, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8486049175262451, + "num_tokens": 250385745.0, + "step": 6562 + }, + { + "epoch": 0.8348810583895179, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.876918911933899, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.859573483467102, + "num_tokens": 250426055.0, + "step": 6563 + }, + { + "epoch": 0.8350082686681084, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.923213243484497, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.85735684633255, + "num_tokens": 250468795.0, + "step": 6564 + }, + { + "epoch": 0.8351354789466989, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.0497918128967285, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8587835431098938, + "num_tokens": 250509115.0, + "step": 6565 + }, + { + "epoch": 0.8352626892252895, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9065779447555542, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.861760675907135, + "num_tokens": 250547770.0, + "step": 6566 + }, + { + "epoch": 0.8353898995038799, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.535673141479492, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8588361740112305, + "num_tokens": 250589737.0, + "step": 6567 + }, + { + "epoch": 0.8355171097824704, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.1926844120025635, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8671680688858032, + "num_tokens": 250623058.0, + "step": 6568 + }, + { + "epoch": 0.8356443200610609, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9763925075531006, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.861345112323761, + "num_tokens": 250660526.0, + "step": 6569 + }, + { + "epoch": 0.8357715303396515, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8554553985595703, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8451269865036011, + "num_tokens": 250698218.0, + "step": 6570 + }, + { + "epoch": 0.835898740618242, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.778147578239441, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8533890247344971, + "num_tokens": 250743410.0, + "step": 6571 + }, + { + "epoch": 0.8360259508968325, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9631327390670776, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8706908822059631, + "num_tokens": 250781919.0, + "step": 6572 + }, + { + "epoch": 0.8361531611754229, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.0331759452819824, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8581348657608032, + "num_tokens": 250817051.0, + "step": 6573 + }, + { + "epoch": 0.8362803714540135, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.880393624305725, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8581723570823669, + "num_tokens": 250857241.0, + "step": 6574 + }, + { + "epoch": 0.836407581732604, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.267425775527954, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8608399629592896, + "num_tokens": 250903591.0, + "step": 6575 + }, + { + "epoch": 0.8365347920111945, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8951818943023682, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.854090690612793, + "num_tokens": 250945788.0, + "step": 6576 + }, + { + "epoch": 0.836662002289785, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8527649641036987, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8542534708976746, + "num_tokens": 250984884.0, + "step": 6577 + }, + { + "epoch": 0.8367892125683756, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8128918409347534, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8731249570846558, + "num_tokens": 251024347.0, + "step": 6578 + }, + { + "epoch": 0.836916422846966, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.774009346961975, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8819888830184937, + "num_tokens": 251068527.0, + "step": 6579 + }, + { + "epoch": 0.8370436331255565, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8324636220932007, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8655986785888672, + "num_tokens": 251109914.0, + "step": 6580 + }, + { + "epoch": 0.837170843404147, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9670435190200806, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8521336913108826, + "num_tokens": 251144760.0, + "step": 6581 + }, + { + "epoch": 0.8372980536827376, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 3.587564706802368, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8596042394638062, + "num_tokens": 251184456.0, + "step": 6582 + }, + { + "epoch": 0.8374252639613281, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7956280708312988, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8525580167770386, + "num_tokens": 251229493.0, + "step": 6583 + }, + { + "epoch": 0.8375524742399186, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.889156699180603, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8660507202148438, + "num_tokens": 251270886.0, + "step": 6584 + }, + { + "epoch": 0.837679684518509, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8660874366760254, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8707021474838257, + "num_tokens": 251304833.0, + "step": 6585 + }, + { + "epoch": 0.8378068947970996, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.121912717819214, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8642423152923584, + "num_tokens": 251334196.0, + "step": 6586 + }, + { + "epoch": 0.8379341050756901, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7517975568771362, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8506489992141724, + "num_tokens": 251380278.0, + "step": 6587 + }, + { + "epoch": 0.8380613153542806, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.836974859237671, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8555864691734314, + "num_tokens": 251418432.0, + "step": 6588 + }, + { + "epoch": 0.8381885256328712, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8870142698287964, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8579440116882324, + "num_tokens": 251453747.0, + "step": 6589 + }, + { + "epoch": 0.8383157359114617, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9373564720153809, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8610919713973999, + "num_tokens": 251492439.0, + "step": 6590 + }, + { + "epoch": 0.8384429461900521, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 20.344289779663086, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8634505271911621, + "num_tokens": 251527945.0, + "step": 6591 + }, + { + "epoch": 0.8385701564686426, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.197427272796631, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8703385591506958, + "num_tokens": 251567161.0, + "step": 6592 + }, + { + "epoch": 0.8386973667472332, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9696673154830933, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8620321750640869, + "num_tokens": 251601189.0, + "step": 6593 + }, + { + "epoch": 0.8388245770258237, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.979121208190918, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8592157959938049, + "num_tokens": 251630846.0, + "step": 6594 + }, + { + "epoch": 0.8389517873044142, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.312924861907959, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8682236075401306, + "num_tokens": 251665701.0, + "step": 6595 + }, + { + "epoch": 0.8390789975830047, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.036381483078003, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8523780107498169, + "num_tokens": 251699909.0, + "step": 6596 + }, + { + "epoch": 0.8392062078615952, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.0818469524383545, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8580769896507263, + "num_tokens": 251734052.0, + "step": 6597 + }, + { + "epoch": 0.8393334181401857, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9047658443450928, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8488131761550903, + "num_tokens": 251772518.0, + "step": 6598 + }, + { + "epoch": 0.8394606284187762, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7394561767578125, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8580044507980347, + "num_tokens": 251816378.0, + "step": 6599 + }, + { + "epoch": 0.8395878386973668, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.947548270225525, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8591428995132446, + "num_tokens": 251851264.0, + "step": 6600 + }, + { + "epoch": 0.8397150489759573, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.0623650550842285, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.861923098564148, + "num_tokens": 251888906.0, + "step": 6601 + }, + { + "epoch": 0.8398422592545478, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8744150400161743, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8671998977661133, + "num_tokens": 251930641.0, + "step": 6602 + }, + { + "epoch": 0.8399694695331382, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.868957757949829, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8427872657775879, + "num_tokens": 251968929.0, + "step": 6603 + }, + { + "epoch": 0.8400966798117288, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9564896821975708, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8523455262184143, + "num_tokens": 252006751.0, + "step": 6604 + }, + { + "epoch": 0.8402238900903193, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8656104803085327, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8565724492073059, + "num_tokens": 252045861.0, + "step": 6605 + }, + { + "epoch": 0.8403511003689098, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8775626420974731, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8533709645271301, + "num_tokens": 252084412.0, + "step": 6606 + }, + { + "epoch": 0.8404783106475003, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 7.726244926452637, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8652185201644897, + "num_tokens": 252120576.0, + "step": 6607 + }, + { + "epoch": 0.8406055209260909, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.939780592918396, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8573512434959412, + "num_tokens": 252161636.0, + "step": 6608 + }, + { + "epoch": 0.8407327312046813, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8745461702346802, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8615615963935852, + "num_tokens": 252199669.0, + "step": 6609 + }, + { + "epoch": 0.8408599414832718, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7545908689498901, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8677890300750732, + "num_tokens": 252236832.0, + "step": 6610 + }, + { + "epoch": 0.8409871517618623, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.760014533996582, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8562225103378296, + "num_tokens": 252282518.0, + "step": 6611 + }, + { + "epoch": 0.8411143620404529, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8623485565185547, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8559203147888184, + "num_tokens": 252316479.0, + "step": 6612 + }, + { + "epoch": 0.8412415723190434, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9398865699768066, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8543858528137207, + "num_tokens": 252353789.0, + "step": 6613 + }, + { + "epoch": 0.8413687825976339, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8660099506378174, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8631921410560608, + "num_tokens": 252391609.0, + "step": 6614 + }, + { + "epoch": 0.8414959928762245, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.8625599145889282, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8746064901351929, + "num_tokens": 252433514.0, + "step": 6615 + }, + { + "epoch": 0.8416232031548149, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.9018402099609375, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8644638061523438, + "num_tokens": 252469700.0, + "step": 6616 + }, + { + "epoch": 0.8417504134334054, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.0201587677001953, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.857885479927063, + "num_tokens": 252501658.0, + "step": 6617 + }, + { + "epoch": 0.8418776237119959, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.0320067405700684, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8626997470855713, + "num_tokens": 252534546.0, + "step": 6618 + }, + { + "epoch": 0.8420048339905865, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8690000772476196, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8691080212593079, + "num_tokens": 252573172.0, + "step": 6619 + }, + { + "epoch": 0.842132044269177, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9344151020050049, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8697723746299744, + "num_tokens": 252607258.0, + "step": 6620 + }, + { + "epoch": 0.8422592545477675, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.728832721710205, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8709425926208496, + "num_tokens": 252651451.0, + "step": 6621 + }, + { + "epoch": 0.8423864648263579, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.796778917312622, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8724805116653442, + "num_tokens": 252686348.0, + "step": 6622 + }, + { + "epoch": 0.8425136751049485, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.78809654712677, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8552076816558838, + "num_tokens": 252724316.0, + "step": 6623 + }, + { + "epoch": 0.842640885383539, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.873463749885559, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.850460410118103, + "num_tokens": 252764612.0, + "step": 6624 + }, + { + "epoch": 0.8427680956621295, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7782435417175293, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8400927782058716, + "num_tokens": 252809510.0, + "step": 6625 + }, + { + "epoch": 0.84289530594072, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8380544185638428, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8589370250701904, + "num_tokens": 252847598.0, + "step": 6626 + }, + { + "epoch": 0.8430225162193106, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.849180817604065, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8461259603500366, + "num_tokens": 252884401.0, + "step": 6627 + }, + { + "epoch": 0.843149726497901, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9020750522613525, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8442403674125671, + "num_tokens": 252928348.0, + "step": 6628 + }, + { + "epoch": 0.8432769367764915, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9538607597351074, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8597511053085327, + "num_tokens": 252965084.0, + "step": 6629 + }, + { + "epoch": 0.843404147055082, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9677382707595825, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8642786741256714, + "num_tokens": 253003697.0, + "step": 6630 + }, + { + "epoch": 0.8435313573336726, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.0085461139678955, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8460105061531067, + "num_tokens": 253037239.0, + "step": 6631 + }, + { + "epoch": 0.8436585676122631, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.822395920753479, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.851885974407196, + "num_tokens": 253078163.0, + "step": 6632 + }, + { + "epoch": 0.8437857778908536, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.78901207447052, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8589820265769958, + "num_tokens": 253118907.0, + "step": 6633 + }, + { + "epoch": 0.843912988169444, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8190388679504395, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8784531354904175, + "num_tokens": 253153132.0, + "step": 6634 + }, + { + "epoch": 0.8440401984480346, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8859702348709106, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8568767309188843, + "num_tokens": 253194446.0, + "step": 6635 + }, + { + "epoch": 0.8441674087266251, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8114029169082642, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8628345727920532, + "num_tokens": 253230798.0, + "step": 6636 + }, + { + "epoch": 0.8442946190052156, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.849941611289978, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8533051013946533, + "num_tokens": 253269029.0, + "step": 6637 + }, + { + "epoch": 0.8444218292838062, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7466481924057007, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8567705154418945, + "num_tokens": 253310656.0, + "step": 6638 + }, + { + "epoch": 0.8445490395623967, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.762893795967102, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8777006268501282, + "num_tokens": 253346882.0, + "step": 6639 + }, + { + "epoch": 0.8446762498409871, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.773626685142517, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8509321212768555, + "num_tokens": 253389601.0, + "step": 6640 + }, + { + "epoch": 0.8448034601195776, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8395148515701294, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8589107990264893, + "num_tokens": 253428725.0, + "step": 6641 + }, + { + "epoch": 0.8449306703981682, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.9059209823608398, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8743286728858948, + "num_tokens": 253461404.0, + "step": 6642 + }, + { + "epoch": 0.8450578806767587, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8587009906768799, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.863291323184967, + "num_tokens": 253500324.0, + "step": 6643 + }, + { + "epoch": 0.8451850909553492, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8836790323257446, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8473143577575684, + "num_tokens": 253542500.0, + "step": 6644 + }, + { + "epoch": 0.8453123012339397, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.8632546663284302, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8429878354072571, + "num_tokens": 253581887.0, + "step": 6645 + }, + { + "epoch": 0.8454395115125302, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.534926176071167, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8680001497268677, + "num_tokens": 253616942.0, + "step": 6646 + }, + { + "epoch": 0.8455667217911207, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.026447296142578, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.855323314666748, + "num_tokens": 253652610.0, + "step": 6647 + }, + { + "epoch": 0.8456939320697112, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.972702980041504, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8593460321426392, + "num_tokens": 253686269.0, + "step": 6648 + }, + { + "epoch": 0.8458211423483017, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8827364444732666, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8548688292503357, + "num_tokens": 253723632.0, + "step": 6649 + }, + { + "epoch": 0.8459483526268923, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.09592866897583, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8542132377624512, + "num_tokens": 253755990.0, + "step": 6650 + }, + { + "epoch": 0.8460755629054828, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8408703804016113, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8545491695404053, + "num_tokens": 253794200.0, + "step": 6651 + }, + { + "epoch": 0.8462027731840732, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8172717094421387, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.870991587638855, + "num_tokens": 253840554.0, + "step": 6652 + }, + { + "epoch": 0.8463299834626637, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9013090133666992, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8693639636039734, + "num_tokens": 253873642.0, + "step": 6653 + }, + { + "epoch": 0.8464571937412543, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.841472864151001, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8657288551330566, + "num_tokens": 253911839.0, + "step": 6654 + }, + { + "epoch": 0.8465844040198448, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.593029737472534, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8508901596069336, + "num_tokens": 253944649.0, + "step": 6655 + }, + { + "epoch": 0.8467116142984353, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 25.294858932495117, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8607526421546936, + "num_tokens": 253977739.0, + "step": 6656 + }, + { + "epoch": 0.8468388245770259, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.0826034545898438, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8458373546600342, + "num_tokens": 254021216.0, + "step": 6657 + }, + { + "epoch": 0.8469660348556163, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0396323204040527, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.857806921005249, + "num_tokens": 254063209.0, + "step": 6658 + }, + { + "epoch": 0.8470932451342068, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.021137237548828, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8547428250312805, + "num_tokens": 254102325.0, + "step": 6659 + }, + { + "epoch": 0.8472204554127973, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.90069580078125, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8652245998382568, + "num_tokens": 254136991.0, + "step": 6660 + }, + { + "epoch": 0.8473476656913879, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8623998165130615, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8604313135147095, + "num_tokens": 254179520.0, + "step": 6661 + }, + { + "epoch": 0.8474748759699784, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9943780899047852, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8601925373077393, + "num_tokens": 254220661.0, + "step": 6662 + }, + { + "epoch": 0.8476020862485689, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7728976011276245, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8679405450820923, + "num_tokens": 254261853.0, + "step": 6663 + }, + { + "epoch": 0.8477292965271594, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9359248876571655, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8618862628936768, + "num_tokens": 254301577.0, + "step": 6664 + }, + { + "epoch": 0.8478565068057499, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.758306860923767, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8738278746604919, + "num_tokens": 254341792.0, + "step": 6665 + }, + { + "epoch": 0.8479837170843404, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0558602809906006, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8477160930633545, + "num_tokens": 254376755.0, + "step": 6666 + }, + { + "epoch": 0.8481109273629309, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9142324924468994, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8697894811630249, + "num_tokens": 254413565.0, + "step": 6667 + }, + { + "epoch": 0.8482381376415215, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.881174325942993, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8495899438858032, + "num_tokens": 254449889.0, + "step": 6668 + }, + { + "epoch": 0.848365347920112, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9569512605667114, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8741951584815979, + "num_tokens": 254488129.0, + "step": 6669 + }, + { + "epoch": 0.8484925581987025, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9625298976898193, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8716628551483154, + "num_tokens": 254522031.0, + "step": 6670 + }, + { + "epoch": 0.8486197684772929, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.849550724029541, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8709195852279663, + "num_tokens": 254559249.0, + "step": 6671 + }, + { + "epoch": 0.8487469787558835, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.78350830078125, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8531257510185242, + "num_tokens": 254603968.0, + "step": 6672 + }, + { + "epoch": 0.848874189034474, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.895102620124817, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8692412376403809, + "num_tokens": 254642178.0, + "step": 6673 + }, + { + "epoch": 0.8490013993130645, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.889540433883667, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8534499406814575, + "num_tokens": 254679129.0, + "step": 6674 + }, + { + "epoch": 0.849128609591655, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.102051019668579, + "learning_rate": 1e-06, + "loss": 0.5508, + "mean_token_accuracy": 0.831329882144928, + "num_tokens": 254714952.0, + "step": 6675 + }, + { + "epoch": 0.8492558198702456, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.060936212539673, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.857570230960846, + "num_tokens": 254743323.0, + "step": 6676 + }, + { + "epoch": 0.849383030148836, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.84002685546875, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8670844435691833, + "num_tokens": 254784749.0, + "step": 6677 + }, + { + "epoch": 0.8495102404274265, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9416000843048096, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8673880100250244, + "num_tokens": 254820102.0, + "step": 6678 + }, + { + "epoch": 0.849637450706017, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.896802306175232, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8580889105796814, + "num_tokens": 254859988.0, + "step": 6679 + }, + { + "epoch": 0.8497646609846076, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 7.737879753112793, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8560233116149902, + "num_tokens": 254906792.0, + "step": 6680 + }, + { + "epoch": 0.8498918712631981, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0776634216308594, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8626118898391724, + "num_tokens": 254941552.0, + "step": 6681 + }, + { + "epoch": 0.8500190815417886, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.152440071105957, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8528246879577637, + "num_tokens": 254974653.0, + "step": 6682 + }, + { + "epoch": 0.850146291820379, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9156228303909302, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8448854684829712, + "num_tokens": 255018418.0, + "step": 6683 + }, + { + "epoch": 0.8502735020989696, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.31101393699646, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.866068959236145, + "num_tokens": 255049626.0, + "step": 6684 + }, + { + "epoch": 0.8504007123775601, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.927120327949524, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8550581336021423, + "num_tokens": 255086323.0, + "step": 6685 + }, + { + "epoch": 0.8505279226561506, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.865907907485962, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8686425685882568, + "num_tokens": 255124102.0, + "step": 6686 + }, + { + "epoch": 0.8506551329347412, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.911455750465393, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8637065887451172, + "num_tokens": 255162176.0, + "step": 6687 + }, + { + "epoch": 0.8507823432133317, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.074160575866699, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8631640672683716, + "num_tokens": 255196256.0, + "step": 6688 + }, + { + "epoch": 0.8509095534919221, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8836190700531006, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8542985916137695, + "num_tokens": 255236752.0, + "step": 6689 + }, + { + "epoch": 0.8510367637705126, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0070016384124756, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8611494302749634, + "num_tokens": 255268694.0, + "step": 6690 + }, + { + "epoch": 0.8511639740491032, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.658159017562866, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.867012619972229, + "num_tokens": 255309445.0, + "step": 6691 + }, + { + "epoch": 0.8512911843276937, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0308775901794434, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8630545139312744, + "num_tokens": 255341126.0, + "step": 6692 + }, + { + "epoch": 0.8514183946062842, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8778643608093262, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.86894690990448, + "num_tokens": 255379553.0, + "step": 6693 + }, + { + "epoch": 0.8515456048848747, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8124297857284546, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8610080480575562, + "num_tokens": 255416588.0, + "step": 6694 + }, + { + "epoch": 0.8516728151634652, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.2936439514160156, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.847996711730957, + "num_tokens": 255449574.0, + "step": 6695 + }, + { + "epoch": 0.8518000254420557, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7769765853881836, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8582891225814819, + "num_tokens": 255494088.0, + "step": 6696 + }, + { + "epoch": 0.8519272357206462, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.2944157123565674, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.869094967842102, + "num_tokens": 255529785.0, + "step": 6697 + }, + { + "epoch": 0.8520544459992367, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9884699583053589, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8466567397117615, + "num_tokens": 255566675.0, + "step": 6698 + }, + { + "epoch": 0.8521816562778273, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8015003204345703, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8655292987823486, + "num_tokens": 255606502.0, + "step": 6699 + }, + { + "epoch": 0.8523088665564178, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9931973218917847, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8521201014518738, + "num_tokens": 255649179.0, + "step": 6700 + }, + { + "epoch": 0.8524360768350082, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9817805290222168, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.860399603843689, + "num_tokens": 255688190.0, + "step": 6701 + }, + { + "epoch": 0.8525632871135987, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7927950620651245, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8588815331459045, + "num_tokens": 255730401.0, + "step": 6702 + }, + { + "epoch": 0.8526904973921893, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.822095274925232, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8554337024688721, + "num_tokens": 255767881.0, + "step": 6703 + }, + { + "epoch": 0.8528177076707798, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7938426733016968, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8512455224990845, + "num_tokens": 255808347.0, + "step": 6704 + }, + { + "epoch": 0.8529449179493703, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8578459024429321, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8425498008728027, + "num_tokens": 255849538.0, + "step": 6705 + }, + { + "epoch": 0.8530721282279609, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8711395263671875, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8709223866462708, + "num_tokens": 255890228.0, + "step": 6706 + }, + { + "epoch": 0.8531993385065513, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8063684701919556, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8585231304168701, + "num_tokens": 255930189.0, + "step": 6707 + }, + { + "epoch": 0.8533265487851418, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7678930759429932, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8671407103538513, + "num_tokens": 255968310.0, + "step": 6708 + }, + { + "epoch": 0.8534537590637323, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9522409439086914, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8713290691375732, + "num_tokens": 256005872.0, + "step": 6709 + }, + { + "epoch": 0.8535809693423229, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.76329505443573, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8661618232727051, + "num_tokens": 256045374.0, + "step": 6710 + }, + { + "epoch": 0.8537081796209134, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.751298189163208, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8631621599197388, + "num_tokens": 256084874.0, + "step": 6711 + }, + { + "epoch": 0.8538353898995039, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9784929752349854, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8536832332611084, + "num_tokens": 256122251.0, + "step": 6712 + }, + { + "epoch": 0.8539626001780944, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.913243293762207, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8580785989761353, + "num_tokens": 256165025.0, + "step": 6713 + }, + { + "epoch": 0.8540898104566849, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7991586923599243, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8666152954101562, + "num_tokens": 256202220.0, + "step": 6714 + }, + { + "epoch": 0.8542170207352754, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9774175882339478, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8617188930511475, + "num_tokens": 256234036.0, + "step": 6715 + }, + { + "epoch": 0.8543442310138659, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.825468897819519, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8585293292999268, + "num_tokens": 256275253.0, + "step": 6716 + }, + { + "epoch": 0.8544714412924564, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8917344808578491, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8574659824371338, + "num_tokens": 256311296.0, + "step": 6717 + }, + { + "epoch": 0.854598651571047, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8177038431167603, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8658257722854614, + "num_tokens": 256348149.0, + "step": 6718 + }, + { + "epoch": 0.8547258618496375, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8867095708847046, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8743311166763306, + "num_tokens": 256387861.0, + "step": 6719 + }, + { + "epoch": 0.8548530721282279, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.3849124908447266, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8697120547294617, + "num_tokens": 256427131.0, + "step": 6720 + }, + { + "epoch": 0.8549802824068184, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7449026107788086, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8651300072669983, + "num_tokens": 256465041.0, + "step": 6721 + }, + { + "epoch": 0.855107492685409, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8376344442367554, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8521541357040405, + "num_tokens": 256502576.0, + "step": 6722 + }, + { + "epoch": 0.8552347029639995, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7768492698669434, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8629175424575806, + "num_tokens": 256540493.0, + "step": 6723 + }, + { + "epoch": 0.85536191324259, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9273326396942139, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8439000248908997, + "num_tokens": 256578115.0, + "step": 6724 + }, + { + "epoch": 0.8554891235211806, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0283966064453125, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8684431910514832, + "num_tokens": 256615957.0, + "step": 6725 + }, + { + "epoch": 0.855616333799771, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9104113578796387, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.863531231880188, + "num_tokens": 256649972.0, + "step": 6726 + }, + { + "epoch": 0.8557435440783615, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9309786558151245, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8671126365661621, + "num_tokens": 256686683.0, + "step": 6727 + }, + { + "epoch": 0.855870754356952, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9713351726531982, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8588703274726868, + "num_tokens": 256723587.0, + "step": 6728 + }, + { + "epoch": 0.8559979646355426, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8446847200393677, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8616245985031128, + "num_tokens": 256765245.0, + "step": 6729 + }, + { + "epoch": 0.8561251749141331, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.6679394245147705, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8877329230308533, + "num_tokens": 256806930.0, + "step": 6730 + }, + { + "epoch": 0.8562523851927236, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9624789953231812, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8595030903816223, + "num_tokens": 256844568.0, + "step": 6731 + }, + { + "epoch": 0.856379595471314, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9932551383972168, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8594335317611694, + "num_tokens": 256877483.0, + "step": 6732 + }, + { + "epoch": 0.8565068057499046, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8262614011764526, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.86545729637146, + "num_tokens": 256921139.0, + "step": 6733 + }, + { + "epoch": 0.8566340160284951, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9717895984649658, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8610216379165649, + "num_tokens": 256957861.0, + "step": 6734 + }, + { + "epoch": 0.8567612263070856, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.777622938156128, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8633474707603455, + "num_tokens": 256998582.0, + "step": 6735 + }, + { + "epoch": 0.8568884365856761, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8916983604431152, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8585717082023621, + "num_tokens": 257037638.0, + "step": 6736 + }, + { + "epoch": 0.8570156468642667, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8272172212600708, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8638630509376526, + "num_tokens": 257074427.0, + "step": 6737 + }, + { + "epoch": 0.8571428571428571, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8199247121810913, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8636413812637329, + "num_tokens": 257113826.0, + "step": 6738 + }, + { + "epoch": 0.8572700674214476, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0522682666778564, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.847057580947876, + "num_tokens": 257152387.0, + "step": 6739 + }, + { + "epoch": 0.8573972777000382, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.783345103263855, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8630715012550354, + "num_tokens": 257192083.0, + "step": 6740 + }, + { + "epoch": 0.8575244879786287, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7990517616271973, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8527121543884277, + "num_tokens": 257235425.0, + "step": 6741 + }, + { + "epoch": 0.8576516982572192, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0023396015167236, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.857136607170105, + "num_tokens": 257268621.0, + "step": 6742 + }, + { + "epoch": 0.8577789085358097, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7984719276428223, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8654407262802124, + "num_tokens": 257306909.0, + "step": 6743 + }, + { + "epoch": 0.8579061188144002, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8967260122299194, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8756986856460571, + "num_tokens": 257343388.0, + "step": 6744 + }, + { + "epoch": 0.8580333290929907, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.1903557777404785, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8663140535354614, + "num_tokens": 257377172.0, + "step": 6745 + }, + { + "epoch": 0.8581605393715812, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9527019262313843, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8536491394042969, + "num_tokens": 257410312.0, + "step": 6746 + }, + { + "epoch": 0.8582877496501717, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8021221160888672, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8636342883110046, + "num_tokens": 257449919.0, + "step": 6747 + }, + { + "epoch": 0.8584149599287623, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7579460144042969, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8723753094673157, + "num_tokens": 257488296.0, + "step": 6748 + }, + { + "epoch": 0.8585421702073528, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9050379991531372, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8692923188209534, + "num_tokens": 257524678.0, + "step": 6749 + }, + { + "epoch": 0.8586693804859432, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9062888622283936, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8665885925292969, + "num_tokens": 257559060.0, + "step": 6750 + }, + { + "epoch": 0.8587965907645337, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.2832109928131104, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8706883788108826, + "num_tokens": 257594340.0, + "step": 6751 + }, + { + "epoch": 0.8589238010431243, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9084925651550293, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.873164176940918, + "num_tokens": 257627936.0, + "step": 6752 + }, + { + "epoch": 0.8590510113217148, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.998684048652649, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8586840629577637, + "num_tokens": 257660311.0, + "step": 6753 + }, + { + "epoch": 0.8591782216003053, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.073383092880249, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8548272848129272, + "num_tokens": 257697496.0, + "step": 6754 + }, + { + "epoch": 0.8593054318788959, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9403188228607178, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.843217134475708, + "num_tokens": 257737242.0, + "step": 6755 + }, + { + "epoch": 0.8594326421574863, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.907968282699585, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8623412847518921, + "num_tokens": 257772946.0, + "step": 6756 + }, + { + "epoch": 0.8595598524360768, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0236258506774902, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.856163501739502, + "num_tokens": 257807198.0, + "step": 6757 + }, + { + "epoch": 0.8596870627146673, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7147315740585327, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8671707510948181, + "num_tokens": 257848618.0, + "step": 6758 + }, + { + "epoch": 0.8598142729932579, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7187871932983398, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8725817203521729, + "num_tokens": 257887805.0, + "step": 6759 + }, + { + "epoch": 0.8599414832718484, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.756664514541626, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8675096035003662, + "num_tokens": 257929276.0, + "step": 6760 + }, + { + "epoch": 0.8600686935504389, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.084298849105835, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8556162118911743, + "num_tokens": 257958629.0, + "step": 6761 + }, + { + "epoch": 0.8601959038290294, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.947156310081482, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8572970032691956, + "num_tokens": 257990303.0, + "step": 6762 + }, + { + "epoch": 0.8603231141076199, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.013939619064331, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8446089625358582, + "num_tokens": 258028704.0, + "step": 6763 + }, + { + "epoch": 0.8604503243862104, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9668800830841064, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8699782490730286, + "num_tokens": 258062757.0, + "step": 6764 + }, + { + "epoch": 0.8605775346648009, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9575526714324951, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8613044023513794, + "num_tokens": 258098574.0, + "step": 6765 + }, + { + "epoch": 0.8607047449433914, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9421137571334839, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.871171772480011, + "num_tokens": 258130937.0, + "step": 6766 + }, + { + "epoch": 0.860831955221982, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8831413984298706, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8638327717781067, + "num_tokens": 258165142.0, + "step": 6767 + }, + { + "epoch": 0.8609591655005725, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.837222695350647, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8772867321968079, + "num_tokens": 258203986.0, + "step": 6768 + }, + { + "epoch": 0.8610863757791629, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8687852621078491, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8622569441795349, + "num_tokens": 258238692.0, + "step": 6769 + }, + { + "epoch": 0.8612135860577534, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0022528171539307, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8418881893157959, + "num_tokens": 258280942.0, + "step": 6770 + }, + { + "epoch": 0.861340796336344, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9609382152557373, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8631258010864258, + "num_tokens": 258318835.0, + "step": 6771 + }, + { + "epoch": 0.8614680066149345, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7114347219467163, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8537958860397339, + "num_tokens": 258359461.0, + "step": 6772 + }, + { + "epoch": 0.861595216893525, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.128742218017578, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8577064275741577, + "num_tokens": 258394582.0, + "step": 6773 + }, + { + "epoch": 0.8617224271721156, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8533533811569214, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8647488355636597, + "num_tokens": 258438526.0, + "step": 6774 + }, + { + "epoch": 0.861849637450706, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.001297950744629, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.847813069820404, + "num_tokens": 258479332.0, + "step": 6775 + }, + { + "epoch": 0.8619768477292965, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.029862880706787, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8518781065940857, + "num_tokens": 258513202.0, + "step": 6776 + }, + { + "epoch": 0.862104058007887, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8863582611083984, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.863415002822876, + "num_tokens": 258549145.0, + "step": 6777 + }, + { + "epoch": 0.8622312682864776, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.2982497215270996, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8522696495056152, + "num_tokens": 258586421.0, + "step": 6778 + }, + { + "epoch": 0.8623584785650681, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8840869665145874, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8626078367233276, + "num_tokens": 258620014.0, + "step": 6779 + }, + { + "epoch": 0.8624856888436586, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8467600345611572, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8672947883605957, + "num_tokens": 258656124.0, + "step": 6780 + }, + { + "epoch": 0.862612899122249, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0566892623901367, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8494521379470825, + "num_tokens": 258692537.0, + "step": 6781 + }, + { + "epoch": 0.8627401094008396, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8438317775726318, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8656035661697388, + "num_tokens": 258732737.0, + "step": 6782 + }, + { + "epoch": 0.8628673196794301, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9703295230865479, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8487569689750671, + "num_tokens": 258770634.0, + "step": 6783 + }, + { + "epoch": 0.8629945299580206, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9087074995040894, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.860547661781311, + "num_tokens": 258809643.0, + "step": 6784 + }, + { + "epoch": 0.8631217402366111, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8698049783706665, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8659907579421997, + "num_tokens": 258845336.0, + "step": 6785 + }, + { + "epoch": 0.8632489505152017, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8488417863845825, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8634717464447021, + "num_tokens": 258882206.0, + "step": 6786 + }, + { + "epoch": 0.8633761607937921, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.947376012802124, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8685316443443298, + "num_tokens": 258918661.0, + "step": 6787 + }, + { + "epoch": 0.8635033710723826, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7637628316879272, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8559785485267639, + "num_tokens": 258957452.0, + "step": 6788 + }, + { + "epoch": 0.8636305813509731, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7020155191421509, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8520234823226929, + "num_tokens": 259003507.0, + "step": 6789 + }, + { + "epoch": 0.8637577916295637, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9009135961532593, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8559698462486267, + "num_tokens": 259042284.0, + "step": 6790 + }, + { + "epoch": 0.8638850019081542, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9375606775283813, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8754062056541443, + "num_tokens": 259082929.0, + "step": 6791 + }, + { + "epoch": 0.8640122121867447, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.6476932764053345, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8751018047332764, + "num_tokens": 259125920.0, + "step": 6792 + }, + { + "epoch": 0.8641394224653351, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9588468074798584, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8630834817886353, + "num_tokens": 259163140.0, + "step": 6793 + }, + { + "epoch": 0.8642666327439257, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9420819282531738, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8594644069671631, + "num_tokens": 259199038.0, + "step": 6794 + }, + { + "epoch": 0.8643938430225162, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7976292371749878, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8603559732437134, + "num_tokens": 259235713.0, + "step": 6795 + }, + { + "epoch": 0.8645210533011067, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9213911294937134, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.85074782371521, + "num_tokens": 259278314.0, + "step": 6796 + }, + { + "epoch": 0.8646482635796973, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8695443868637085, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8688916563987732, + "num_tokens": 259313039.0, + "step": 6797 + }, + { + "epoch": 0.8647754738582878, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8050611019134521, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8678662180900574, + "num_tokens": 259348845.0, + "step": 6798 + }, + { + "epoch": 0.8649026841368782, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9087941646575928, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8559474349021912, + "num_tokens": 259387345.0, + "step": 6799 + }, + { + "epoch": 0.8650298944154687, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0034282207489014, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8599922060966492, + "num_tokens": 259427835.0, + "step": 6800 + }, + { + "epoch": 0.8651571046940593, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9887137413024902, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8646876811981201, + "num_tokens": 259460453.0, + "step": 6801 + }, + { + "epoch": 0.8652843149726498, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.746487021446228, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8661133646965027, + "num_tokens": 259499296.0, + "step": 6802 + }, + { + "epoch": 0.8654115252512403, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8337973356246948, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8576802015304565, + "num_tokens": 259537649.0, + "step": 6803 + }, + { + "epoch": 0.8655387355298308, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8955621719360352, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8531317114830017, + "num_tokens": 259583136.0, + "step": 6804 + }, + { + "epoch": 0.8656659458084213, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.789124846458435, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8651180267333984, + "num_tokens": 259620422.0, + "step": 6805 + }, + { + "epoch": 0.8657931560870118, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.857347846031189, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8496304750442505, + "num_tokens": 259663948.0, + "step": 6806 + }, + { + "epoch": 0.8659203663656023, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9198068380355835, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8626867532730103, + "num_tokens": 259701986.0, + "step": 6807 + }, + { + "epoch": 0.8660475766441929, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.798845648765564, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.87613445520401, + "num_tokens": 259740684.0, + "step": 6808 + }, + { + "epoch": 0.8661747869227834, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9674173593521118, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8444817066192627, + "num_tokens": 259781659.0, + "step": 6809 + }, + { + "epoch": 0.8663019972013739, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8530058860778809, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8558419942855835, + "num_tokens": 259818554.0, + "step": 6810 + }, + { + "epoch": 0.8664292074799644, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.2627525329589844, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8627270460128784, + "num_tokens": 259856652.0, + "step": 6811 + }, + { + "epoch": 0.8665564177585549, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8120797872543335, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8794347643852234, + "num_tokens": 259898255.0, + "step": 6812 + }, + { + "epoch": 0.8666836280371454, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9464954137802124, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8660975694656372, + "num_tokens": 259935305.0, + "step": 6813 + }, + { + "epoch": 0.8668108383157359, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7396669387817383, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8645776510238647, + "num_tokens": 259975245.0, + "step": 6814 + }, + { + "epoch": 0.8669380485943264, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.687923550605774, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8667160868644714, + "num_tokens": 260014150.0, + "step": 6815 + }, + { + "epoch": 0.867065258872917, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.992221474647522, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8515996932983398, + "num_tokens": 260054655.0, + "step": 6816 + }, + { + "epoch": 0.8671924691515075, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7694958448410034, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8606024980545044, + "num_tokens": 260091528.0, + "step": 6817 + }, + { + "epoch": 0.8673196794300979, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.809558629989624, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8766718506813049, + "num_tokens": 260123069.0, + "step": 6818 + }, + { + "epoch": 0.8674468897086884, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.2266182899475098, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8452571630477905, + "num_tokens": 260154482.0, + "step": 6819 + }, + { + "epoch": 0.867574099987279, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.116581678390503, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8631986975669861, + "num_tokens": 260193983.0, + "step": 6820 + }, + { + "epoch": 0.8677013102658695, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.065925359725952, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8644395470619202, + "num_tokens": 260228705.0, + "step": 6821 + }, + { + "epoch": 0.86782852054446, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9847338199615479, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8598208427429199, + "num_tokens": 260262821.0, + "step": 6822 + }, + { + "epoch": 0.8679557308230506, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8195258378982544, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8594265580177307, + "num_tokens": 260298949.0, + "step": 6823 + }, + { + "epoch": 0.868082941101641, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8483494520187378, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.868863582611084, + "num_tokens": 260340028.0, + "step": 6824 + }, + { + "epoch": 0.8682101513802315, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9270331859588623, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8609811067581177, + "num_tokens": 260381328.0, + "step": 6825 + }, + { + "epoch": 0.868337361658822, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8285647630691528, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8613091707229614, + "num_tokens": 260423421.0, + "step": 6826 + }, + { + "epoch": 0.8684645719374126, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7857710123062134, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8598427176475525, + "num_tokens": 260461985.0, + "step": 6827 + }, + { + "epoch": 0.8685917822160031, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.761840581893921, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8469414114952087, + "num_tokens": 260503041.0, + "step": 6828 + }, + { + "epoch": 0.8687189924945936, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.991284728050232, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.869339108467102, + "num_tokens": 260545553.0, + "step": 6829 + }, + { + "epoch": 0.868846202773184, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.365833044052124, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.867157518863678, + "num_tokens": 260577160.0, + "step": 6830 + }, + { + "epoch": 0.8689734130517746, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8154759407043457, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8737636804580688, + "num_tokens": 260614748.0, + "step": 6831 + }, + { + "epoch": 0.8691006233303651, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8885585069656372, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8621724843978882, + "num_tokens": 260651762.0, + "step": 6832 + }, + { + "epoch": 0.8692278336089556, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.3988654613494873, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8497512340545654, + "num_tokens": 260694438.0, + "step": 6833 + }, + { + "epoch": 0.8693550438875461, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.3812878131866455, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8691326379776001, + "num_tokens": 260730256.0, + "step": 6834 + }, + { + "epoch": 0.8694822541661367, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9424560070037842, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8411032557487488, + "num_tokens": 260766190.0, + "step": 6835 + }, + { + "epoch": 0.8696094644447271, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7348902225494385, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8755141496658325, + "num_tokens": 260801733.0, + "step": 6836 + }, + { + "epoch": 0.8697366747233176, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8368611335754395, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8732129335403442, + "num_tokens": 260843461.0, + "step": 6837 + }, + { + "epoch": 0.8698638850019081, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9779870510101318, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8603594303131104, + "num_tokens": 260875006.0, + "step": 6838 + }, + { + "epoch": 0.8699910952804987, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8456894159317017, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8550137281417847, + "num_tokens": 260917553.0, + "step": 6839 + }, + { + "epoch": 0.8701183055590892, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9047775268554688, + "learning_rate": 1e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8376970291137695, + "num_tokens": 260955293.0, + "step": 6840 + }, + { + "epoch": 0.8702455158376797, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0532517433166504, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8621575832366943, + "num_tokens": 260983966.0, + "step": 6841 + }, + { + "epoch": 0.8703727261162701, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.884346604347229, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8743767738342285, + "num_tokens": 261019434.0, + "step": 6842 + }, + { + "epoch": 0.8704999363948607, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.861567735671997, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8494711518287659, + "num_tokens": 261053761.0, + "step": 6843 + }, + { + "epoch": 0.8706271466734512, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7941113710403442, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8517544269561768, + "num_tokens": 261093490.0, + "step": 6844 + }, + { + "epoch": 0.8707543569520417, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.020904779434204, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8628363609313965, + "num_tokens": 261129518.0, + "step": 6845 + }, + { + "epoch": 0.8708815672306323, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0213136672973633, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8452495336532593, + "num_tokens": 261164282.0, + "step": 6846 + }, + { + "epoch": 0.8710087775092228, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.780125617980957, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8402360677719116, + "num_tokens": 261205055.0, + "step": 6847 + }, + { + "epoch": 0.8711359877878132, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9396908283233643, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.854739785194397, + "num_tokens": 261242329.0, + "step": 6848 + }, + { + "epoch": 0.8712631980664037, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9128162860870361, + "learning_rate": 1e-06, + "loss": 0.5343, + "mean_token_accuracy": 0.8336795568466187, + "num_tokens": 261280330.0, + "step": 6849 + }, + { + "epoch": 0.8713904083449943, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7071800231933594, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8726012110710144, + "num_tokens": 261318232.0, + "step": 6850 + }, + { + "epoch": 0.8715176186235848, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9441642761230469, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8779649138450623, + "num_tokens": 261351483.0, + "step": 6851 + }, + { + "epoch": 0.8716448289021753, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9393188953399658, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8636622428894043, + "num_tokens": 261387058.0, + "step": 6852 + }, + { + "epoch": 0.8717720391807658, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.6703212261199951, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8760327696800232, + "num_tokens": 261430110.0, + "step": 6853 + }, + { + "epoch": 0.8718992494593563, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.760474443435669, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8683397769927979, + "num_tokens": 261470707.0, + "step": 6854 + }, + { + "epoch": 0.8720264597379468, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8330661058425903, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8690062761306763, + "num_tokens": 261507146.0, + "step": 6855 + }, + { + "epoch": 0.8721536700165373, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8613921403884888, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8462239503860474, + "num_tokens": 261549723.0, + "step": 6856 + }, + { + "epoch": 0.8722808802951278, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8539369106292725, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8662652373313904, + "num_tokens": 261591030.0, + "step": 6857 + }, + { + "epoch": 0.8724080905737184, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.019174814224243, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8709351420402527, + "num_tokens": 261623919.0, + "step": 6858 + }, + { + "epoch": 0.8725353008523089, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9226152896881104, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8780882358551025, + "num_tokens": 261655277.0, + "step": 6859 + }, + { + "epoch": 0.8726625111308994, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9192564487457275, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8491765260696411, + "num_tokens": 261697462.0, + "step": 6860 + }, + { + "epoch": 0.8727897214094898, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8800346851348877, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8589536547660828, + "num_tokens": 261739134.0, + "step": 6861 + }, + { + "epoch": 0.8729169316880804, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9082281589508057, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8648331165313721, + "num_tokens": 261776295.0, + "step": 6862 + }, + { + "epoch": 0.8730441419666709, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0365915298461914, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8641782999038696, + "num_tokens": 261812388.0, + "step": 6863 + }, + { + "epoch": 0.8731713522452614, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7752948999404907, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.862933874130249, + "num_tokens": 261855537.0, + "step": 6864 + }, + { + "epoch": 0.873298562523852, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.654241681098938, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8700190186500549, + "num_tokens": 261899922.0, + "step": 6865 + }, + { + "epoch": 0.8734257728024425, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8738325834274292, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8736685514450073, + "num_tokens": 261934475.0, + "step": 6866 + }, + { + "epoch": 0.8735529830810329, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.0118408203125, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8613986372947693, + "num_tokens": 261973395.0, + "step": 6867 + }, + { + "epoch": 0.8736801933596234, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.146109104156494, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8456152677536011, + "num_tokens": 262011644.0, + "step": 6868 + }, + { + "epoch": 0.873807403638214, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.176475763320923, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8577082753181458, + "num_tokens": 262044038.0, + "step": 6869 + }, + { + "epoch": 0.8739346139168045, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8879402875900269, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8607714176177979, + "num_tokens": 262081023.0, + "step": 6870 + }, + { + "epoch": 0.874061824195395, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.765092134475708, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8535759449005127, + "num_tokens": 262123981.0, + "step": 6871 + }, + { + "epoch": 0.8741890344739855, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.1253740787506104, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8505645990371704, + "num_tokens": 262161542.0, + "step": 6872 + }, + { + "epoch": 0.874316244752576, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8492765426635742, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8510528206825256, + "num_tokens": 262202482.0, + "step": 6873 + }, + { + "epoch": 0.8744434550311665, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9885168075561523, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.862317681312561, + "num_tokens": 262239499.0, + "step": 6874 + }, + { + "epoch": 0.874570665309757, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.725893974304199, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8634120225906372, + "num_tokens": 262281324.0, + "step": 6875 + }, + { + "epoch": 0.8746978755883476, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.04732346534729, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8526707887649536, + "num_tokens": 262318772.0, + "step": 6876 + }, + { + "epoch": 0.8748250858669381, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9253414869308472, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.873810887336731, + "num_tokens": 262353264.0, + "step": 6877 + }, + { + "epoch": 0.8749522961455286, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7896389961242676, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8590731620788574, + "num_tokens": 262393360.0, + "step": 6878 + }, + { + "epoch": 0.875079506424119, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9173868894577026, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8721973299980164, + "num_tokens": 262432010.0, + "step": 6879 + }, + { + "epoch": 0.8752067167027096, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9849425554275513, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8622244596481323, + "num_tokens": 262468828.0, + "step": 6880 + }, + { + "epoch": 0.8753339269813001, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8192811012268066, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8689848184585571, + "num_tokens": 262515141.0, + "step": 6881 + }, + { + "epoch": 0.8754611372598906, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8875279426574707, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8596259355545044, + "num_tokens": 262553137.0, + "step": 6882 + }, + { + "epoch": 0.8755883475384811, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7524093389511108, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8845720887184143, + "num_tokens": 262593847.0, + "step": 6883 + }, + { + "epoch": 0.8757155578170717, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9392552375793457, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8509631156921387, + "num_tokens": 262635048.0, + "step": 6884 + }, + { + "epoch": 0.8758427680956621, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8966957330703735, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8527647256851196, + "num_tokens": 262671200.0, + "step": 6885 + }, + { + "epoch": 0.8759699783742526, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.933705449104309, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.863507866859436, + "num_tokens": 262708304.0, + "step": 6886 + }, + { + "epoch": 0.8760971886528431, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.6858426332473755, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8594896793365479, + "num_tokens": 262750680.0, + "step": 6887 + }, + { + "epoch": 0.8762243989314337, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.2774746417999268, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8565775156021118, + "num_tokens": 262784845.0, + "step": 6888 + }, + { + "epoch": 0.8763516092100242, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9722528457641602, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8621295690536499, + "num_tokens": 262822646.0, + "step": 6889 + }, + { + "epoch": 0.8764788194886147, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9729433059692383, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8575617074966431, + "num_tokens": 262863663.0, + "step": 6890 + }, + { + "epoch": 0.8766060297672051, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9622231721878052, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8482664823532104, + "num_tokens": 262903726.0, + "step": 6891 + }, + { + "epoch": 0.8767332400457957, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.925403356552124, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8516772389411926, + "num_tokens": 262942748.0, + "step": 6892 + }, + { + "epoch": 0.8768604503243862, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.9793246984481812, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8415374159812927, + "num_tokens": 262977639.0, + "step": 6893 + }, + { + "epoch": 0.8769876606029767, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8463284969329834, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8646098971366882, + "num_tokens": 263015970.0, + "step": 6894 + }, + { + "epoch": 0.8771148708815673, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8052841424942017, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8718286752700806, + "num_tokens": 263058618.0, + "step": 6895 + }, + { + "epoch": 0.8772420811601578, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8855730295181274, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8457919359207153, + "num_tokens": 263098628.0, + "step": 6896 + }, + { + "epoch": 0.8773692914387482, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9122346639633179, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8649719953536987, + "num_tokens": 263136793.0, + "step": 6897 + }, + { + "epoch": 0.8774965017173387, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.161062002182007, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8634082674980164, + "num_tokens": 263170996.0, + "step": 6898 + }, + { + "epoch": 0.8776237119959293, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.4465816020965576, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8524027466773987, + "num_tokens": 263199961.0, + "step": 6899 + }, + { + "epoch": 0.8777509222745198, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8769230842590332, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8607396483421326, + "num_tokens": 263241541.0, + "step": 6900 + }, + { + "epoch": 0.8778781325531103, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8224729299545288, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8744500875473022, + "num_tokens": 263278778.0, + "step": 6901 + }, + { + "epoch": 0.8780053428317008, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.029906749725342, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8648825287818909, + "num_tokens": 263310155.0, + "step": 6902 + }, + { + "epoch": 0.8781325531102913, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7580153942108154, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8578678965568542, + "num_tokens": 263355793.0, + "step": 6903 + }, + { + "epoch": 0.8782597633888818, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9118531942367554, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.871037483215332, + "num_tokens": 263394446.0, + "step": 6904 + }, + { + "epoch": 0.8783869736674723, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7243304252624512, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8636389970779419, + "num_tokens": 263438479.0, + "step": 6905 + }, + { + "epoch": 0.8785141839460628, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8482730388641357, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8476088643074036, + "num_tokens": 263478514.0, + "step": 6906 + }, + { + "epoch": 0.8786413942246534, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0027027130126953, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8426035642623901, + "num_tokens": 263516734.0, + "step": 6907 + }, + { + "epoch": 0.8787686045032439, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7968049049377441, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8504959344863892, + "num_tokens": 263560105.0, + "step": 6908 + }, + { + "epoch": 0.8788958147818343, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9844666719436646, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8633391857147217, + "num_tokens": 263597033.0, + "step": 6909 + }, + { + "epoch": 0.8790230250604248, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7931568622589111, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8686622381210327, + "num_tokens": 263631106.0, + "step": 6910 + }, + { + "epoch": 0.8791502353390154, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.078686475753784, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8555774688720703, + "num_tokens": 263668227.0, + "step": 6911 + }, + { + "epoch": 0.8792774456176059, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8326843976974487, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8637897372245789, + "num_tokens": 263704335.0, + "step": 6912 + }, + { + "epoch": 0.8794046558961964, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7466357946395874, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8667608499526978, + "num_tokens": 263742532.0, + "step": 6913 + }, + { + "epoch": 0.879531866174787, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9057085514068604, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8617665767669678, + "num_tokens": 263782321.0, + "step": 6914 + }, + { + "epoch": 0.8796590764533775, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9518707990646362, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8688105344772339, + "num_tokens": 263818037.0, + "step": 6915 + }, + { + "epoch": 0.8797862867319679, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.252285957336426, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8471864461898804, + "num_tokens": 263854901.0, + "step": 6916 + }, + { + "epoch": 0.8799134970105584, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8127869367599487, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8666760921478271, + "num_tokens": 263896241.0, + "step": 6917 + }, + { + "epoch": 0.880040707289149, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7639498710632324, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.863601803779602, + "num_tokens": 263934673.0, + "step": 6918 + }, + { + "epoch": 0.8801679175677395, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.834424614906311, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8538534641265869, + "num_tokens": 263973456.0, + "step": 6919 + }, + { + "epoch": 0.88029512784633, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.973284125328064, + "learning_rate": 1e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8449828028678894, + "num_tokens": 264006050.0, + "step": 6920 + }, + { + "epoch": 0.8804223381249205, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8417071104049683, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8718844652175903, + "num_tokens": 264042736.0, + "step": 6921 + }, + { + "epoch": 0.880549548403511, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.908634066581726, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8533806800842285, + "num_tokens": 264084433.0, + "step": 6922 + }, + { + "epoch": 0.8806767586821015, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9190855026245117, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8695387840270996, + "num_tokens": 264118331.0, + "step": 6923 + }, + { + "epoch": 0.880803968960692, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9762212038040161, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8635833263397217, + "num_tokens": 264152015.0, + "step": 6924 + }, + { + "epoch": 0.8809311792392825, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8910413980484009, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8671631813049316, + "num_tokens": 264187085.0, + "step": 6925 + }, + { + "epoch": 0.8810583895178731, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8861973285675049, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8581969738006592, + "num_tokens": 264225558.0, + "step": 6926 + }, + { + "epoch": 0.8811855997964636, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.199960708618164, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8516426086425781, + "num_tokens": 264264220.0, + "step": 6927 + }, + { + "epoch": 0.881312810075054, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0110764503479004, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8651484847068787, + "num_tokens": 264302097.0, + "step": 6928 + }, + { + "epoch": 0.8814400203536445, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.842931866645813, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8632118105888367, + "num_tokens": 264341966.0, + "step": 6929 + }, + { + "epoch": 0.8815672306322351, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9047569036483765, + "learning_rate": 1e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8380520343780518, + "num_tokens": 264384298.0, + "step": 6930 + }, + { + "epoch": 0.8816944409108256, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9049772024154663, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8618784546852112, + "num_tokens": 264422513.0, + "step": 6931 + }, + { + "epoch": 0.8818216511894161, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.351649522781372, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8744363784790039, + "num_tokens": 264461819.0, + "step": 6932 + }, + { + "epoch": 0.8819488614680067, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8344380855560303, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8509382009506226, + "num_tokens": 264502618.0, + "step": 6933 + }, + { + "epoch": 0.8820760717465971, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.947210431098938, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8551309108734131, + "num_tokens": 264534382.0, + "step": 6934 + }, + { + "epoch": 0.8822032820251876, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.755864143371582, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8519126772880554, + "num_tokens": 264575297.0, + "step": 6935 + }, + { + "epoch": 0.8823304923037781, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9104104042053223, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8677830696105957, + "num_tokens": 264611822.0, + "step": 6936 + }, + { + "epoch": 0.8824577025823687, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7585369348526, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8565533757209778, + "num_tokens": 264657483.0, + "step": 6937 + }, + { + "epoch": 0.8825849128609592, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0580289363861084, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8668468594551086, + "num_tokens": 264690633.0, + "step": 6938 + }, + { + "epoch": 0.8827121231395497, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7439600229263306, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8765207529067993, + "num_tokens": 264733268.0, + "step": 6939 + }, + { + "epoch": 0.8828393334181401, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7955129146575928, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8536444902420044, + "num_tokens": 264774769.0, + "step": 6940 + }, + { + "epoch": 0.8829665436967307, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.974247694015503, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8565524816513062, + "num_tokens": 264806658.0, + "step": 6941 + }, + { + "epoch": 0.8830937539753212, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.941526174545288, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8488933444023132, + "num_tokens": 264841054.0, + "step": 6942 + }, + { + "epoch": 0.8832209642539117, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8482587337493896, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8556817770004272, + "num_tokens": 264880241.0, + "step": 6943 + }, + { + "epoch": 0.8833481745325023, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8598120212554932, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8626364469528198, + "num_tokens": 264922449.0, + "step": 6944 + }, + { + "epoch": 0.8834753848110928, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.874953031539917, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.859333872795105, + "num_tokens": 264961605.0, + "step": 6945 + }, + { + "epoch": 0.8836025950896832, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9551217555999756, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8699474334716797, + "num_tokens": 264992924.0, + "step": 6946 + }, + { + "epoch": 0.8837298053682737, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.010275363922119, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.860675573348999, + "num_tokens": 265031849.0, + "step": 6947 + }, + { + "epoch": 0.8838570156468643, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8362195491790771, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.873051643371582, + "num_tokens": 265069714.0, + "step": 6948 + }, + { + "epoch": 0.8839842259254548, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.960381269454956, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.856425404548645, + "num_tokens": 265103325.0, + "step": 6949 + }, + { + "epoch": 0.8841114362040453, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9668428897857666, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8397417664527893, + "num_tokens": 265139133.0, + "step": 6950 + }, + { + "epoch": 0.8842386464826358, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7862261533737183, + "learning_rate": 1e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8366685509681702, + "num_tokens": 265183536.0, + "step": 6951 + }, + { + "epoch": 0.8843658567612263, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8369395732879639, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8527905344963074, + "num_tokens": 265226288.0, + "step": 6952 + }, + { + "epoch": 0.8844930670398168, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.859442949295044, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8605183362960815, + "num_tokens": 265265846.0, + "step": 6953 + }, + { + "epoch": 0.8846202773184073, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.827915906906128, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8749934434890747, + "num_tokens": 265300612.0, + "step": 6954 + }, + { + "epoch": 0.8847474875969978, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.7308106422424316, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8690844774246216, + "num_tokens": 265341440.0, + "step": 6955 + }, + { + "epoch": 0.8848746978755884, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8773467540740967, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8515245914459229, + "num_tokens": 265384854.0, + "step": 6956 + }, + { + "epoch": 0.8850019081541789, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9511988162994385, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8542870283126831, + "num_tokens": 265420611.0, + "step": 6957 + }, + { + "epoch": 0.8851291184327693, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9978060722351074, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8586583137512207, + "num_tokens": 265457720.0, + "step": 6958 + }, + { + "epoch": 0.8852563287113598, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.2216286659240723, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8695082068443298, + "num_tokens": 265493434.0, + "step": 6959 + }, + { + "epoch": 0.8853835389899504, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9401251077651978, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8629546761512756, + "num_tokens": 265529015.0, + "step": 6960 + }, + { + "epoch": 0.8855107492685409, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7847213745117188, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.845923662185669, + "num_tokens": 265568698.0, + "step": 6961 + }, + { + "epoch": 0.8856379595471314, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8113189935684204, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8623497486114502, + "num_tokens": 265605631.0, + "step": 6962 + }, + { + "epoch": 0.885765169825722, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0310404300689697, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8624544739723206, + "num_tokens": 265647071.0, + "step": 6963 + }, + { + "epoch": 0.8858923801043125, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7996548414230347, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8602995276451111, + "num_tokens": 265688680.0, + "step": 6964 + }, + { + "epoch": 0.8860195903829029, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8183553218841553, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8659933805465698, + "num_tokens": 265727707.0, + "step": 6965 + }, + { + "epoch": 0.8861468006614934, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.809626817703247, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8583263158798218, + "num_tokens": 265765860.0, + "step": 6966 + }, + { + "epoch": 0.886274010940084, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.065488815307617, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8641982078552246, + "num_tokens": 265802988.0, + "step": 6967 + }, + { + "epoch": 0.8864012212186745, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.16717267036438, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8554395437240601, + "num_tokens": 265840614.0, + "step": 6968 + }, + { + "epoch": 0.886528431497265, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8737208843231201, + "learning_rate": 1e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8338972330093384, + "num_tokens": 265883032.0, + "step": 6969 + }, + { + "epoch": 0.8866556417758555, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.882731556892395, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8728039860725403, + "num_tokens": 265918309.0, + "step": 6970 + }, + { + "epoch": 0.886782852054446, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8457609415054321, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8632848262786865, + "num_tokens": 265956406.0, + "step": 6971 + }, + { + "epoch": 0.8869100623330365, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0027873516082764, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8694568872451782, + "num_tokens": 265988413.0, + "step": 6972 + }, + { + "epoch": 0.887037272611627, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8958983421325684, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8607255220413208, + "num_tokens": 266022036.0, + "step": 6973 + }, + { + "epoch": 0.8871644828902175, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0125362873077393, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8592710494995117, + "num_tokens": 266053735.0, + "step": 6974 + }, + { + "epoch": 0.8872916931688081, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7496789693832397, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8742130994796753, + "num_tokens": 266094198.0, + "step": 6975 + }, + { + "epoch": 0.8874189034473986, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8987656831741333, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8693360090255737, + "num_tokens": 266132213.0, + "step": 6976 + }, + { + "epoch": 0.887546113725989, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.3576343059539795, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8611891269683838, + "num_tokens": 266168001.0, + "step": 6977 + }, + { + "epoch": 0.8876733240045795, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9721426963806152, + "learning_rate": 1e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8317554593086243, + "num_tokens": 266208771.0, + "step": 6978 + }, + { + "epoch": 0.8878005342831701, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.2975072860717773, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8669354915618896, + "num_tokens": 266248090.0, + "step": 6979 + }, + { + "epoch": 0.8879277445617606, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.935041904449463, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8511795997619629, + "num_tokens": 266283748.0, + "step": 6980 + }, + { + "epoch": 0.8880549548403511, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.977228045463562, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8676954507827759, + "num_tokens": 266316613.0, + "step": 6981 + }, + { + "epoch": 0.8881821651189417, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8111999034881592, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8637370467185974, + "num_tokens": 266357515.0, + "step": 6982 + }, + { + "epoch": 0.8883093753975321, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8911863565444946, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8732309937477112, + "num_tokens": 266393593.0, + "step": 6983 + }, + { + "epoch": 0.8884365856761226, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.819097638130188, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8725935220718384, + "num_tokens": 266436104.0, + "step": 6984 + }, + { + "epoch": 0.8885637959547131, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8054659366607666, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8736971020698547, + "num_tokens": 266472809.0, + "step": 6985 + }, + { + "epoch": 0.8886910062333037, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.290132999420166, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8710405826568604, + "num_tokens": 266512726.0, + "step": 6986 + }, + { + "epoch": 0.8888182165118942, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.155134916305542, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8514922857284546, + "num_tokens": 266546098.0, + "step": 6987 + }, + { + "epoch": 0.8889454267904847, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.257809638977051, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8521206974983215, + "num_tokens": 266588478.0, + "step": 6988 + }, + { + "epoch": 0.8890726370690751, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.789392113685608, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8495227098464966, + "num_tokens": 266628975.0, + "step": 6989 + }, + { + "epoch": 0.8891998473476657, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.6782567501068115, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8697814345359802, + "num_tokens": 266667533.0, + "step": 6990 + }, + { + "epoch": 0.8893270576262562, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9370535612106323, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8425868153572083, + "num_tokens": 266706990.0, + "step": 6991 + }, + { + "epoch": 0.8894542679048467, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9166737794876099, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8478994369506836, + "num_tokens": 266747314.0, + "step": 6992 + }, + { + "epoch": 0.8895814781834372, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.6609910726547241, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8518536686897278, + "num_tokens": 266791031.0, + "step": 6993 + }, + { + "epoch": 0.8897086884620278, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9432891607284546, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8616652488708496, + "num_tokens": 266823236.0, + "step": 6994 + }, + { + "epoch": 0.8898358987406182, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8572866916656494, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8598624467849731, + "num_tokens": 266867993.0, + "step": 6995 + }, + { + "epoch": 0.8899631090192087, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7558127641677856, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8432705402374268, + "num_tokens": 266908573.0, + "step": 6996 + }, + { + "epoch": 0.8900903192977992, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7557307481765747, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8586230874061584, + "num_tokens": 266951167.0, + "step": 6997 + }, + { + "epoch": 0.8902175295763898, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 1.8868008852005005, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8592910170555115, + "num_tokens": 266988976.0, + "step": 6998 + }, + { + "epoch": 0.8903447398549803, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9444717168807983, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8599986433982849, + "num_tokens": 267022450.0, + "step": 6999 + }, + { + "epoch": 0.8904719501335708, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9273141622543335, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8557140827178955, + "num_tokens": 267059933.0, + "step": 7000 + }, + { + "epoch": 0.8905991604121613, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9524250030517578, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8516996502876282, + "num_tokens": 267092518.0, + "step": 7001 + }, + { + "epoch": 0.8907263706907518, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.585073471069336, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8559972047805786, + "num_tokens": 267130008.0, + "step": 7002 + }, + { + "epoch": 0.8908535809693423, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8654041290283203, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8635849952697754, + "num_tokens": 267171039.0, + "step": 7003 + }, + { + "epoch": 0.8909807912479328, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8567615747451782, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8637343049049377, + "num_tokens": 267211197.0, + "step": 7004 + }, + { + "epoch": 0.8911080015265234, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7974928617477417, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8658510446548462, + "num_tokens": 267247471.0, + "step": 7005 + }, + { + "epoch": 0.8912352118051139, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0684492588043213, + "learning_rate": 1e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8341584205627441, + "num_tokens": 267287276.0, + "step": 7006 + }, + { + "epoch": 0.8913624220837043, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8934515714645386, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8576580286026001, + "num_tokens": 267323827.0, + "step": 7007 + }, + { + "epoch": 0.8914896323622948, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.896403193473816, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.869165301322937, + "num_tokens": 267364053.0, + "step": 7008 + }, + { + "epoch": 0.8916168426408854, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9542715549468994, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8444627523422241, + "num_tokens": 267404393.0, + "step": 7009 + }, + { + "epoch": 0.8917440529194759, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8929798603057861, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8615008592605591, + "num_tokens": 267444589.0, + "step": 7010 + }, + { + "epoch": 0.8918712631980664, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8981844186782837, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8616575598716736, + "num_tokens": 267484460.0, + "step": 7011 + }, + { + "epoch": 0.891998473476657, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0210764408111572, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8537534475326538, + "num_tokens": 267517390.0, + "step": 7012 + }, + { + "epoch": 0.8921256837552475, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8092366456985474, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.86091148853302, + "num_tokens": 267555813.0, + "step": 7013 + }, + { + "epoch": 0.8922528940338379, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8425521850585938, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8584553003311157, + "num_tokens": 267595818.0, + "step": 7014 + }, + { + "epoch": 0.8923801043124284, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.018765687942505, + "learning_rate": 1e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.834732174873352, + "num_tokens": 267633106.0, + "step": 7015 + }, + { + "epoch": 0.892507314591019, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0188915729522705, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8628813028335571, + "num_tokens": 267670166.0, + "step": 7016 + }, + { + "epoch": 0.8926345248696095, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9105195999145508, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8568340539932251, + "num_tokens": 267707284.0, + "step": 7017 + }, + { + "epoch": 0.8927617351482, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.868636131286621, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8604080677032471, + "num_tokens": 267742349.0, + "step": 7018 + }, + { + "epoch": 0.8928889454267905, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.4839282035827637, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8657172918319702, + "num_tokens": 267776003.0, + "step": 7019 + }, + { + "epoch": 0.893016155705381, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9523712396621704, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.857138991355896, + "num_tokens": 267815887.0, + "step": 7020 + }, + { + "epoch": 0.8931433659839715, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.937799334526062, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8700960278511047, + "num_tokens": 267855874.0, + "step": 7021 + }, + { + "epoch": 0.893270576262562, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7849111557006836, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8702093958854675, + "num_tokens": 267897901.0, + "step": 7022 + }, + { + "epoch": 0.8933977865411525, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.6602685451507568, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8703693747520447, + "num_tokens": 267938967.0, + "step": 7023 + }, + { + "epoch": 0.8935249968197431, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9889745712280273, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8561633229255676, + "num_tokens": 267975196.0, + "step": 7024 + }, + { + "epoch": 0.8936522070983336, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7511345148086548, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8744165897369385, + "num_tokens": 268014407.0, + "step": 7025 + }, + { + "epoch": 0.893779417376924, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9934697151184082, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8744380474090576, + "num_tokens": 268043892.0, + "step": 7026 + }, + { + "epoch": 0.8939066276555145, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8693801164627075, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8598649501800537, + "num_tokens": 268079651.0, + "step": 7027 + }, + { + "epoch": 0.8940338379341051, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9004545211791992, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8636487722396851, + "num_tokens": 268114021.0, + "step": 7028 + }, + { + "epoch": 0.8941610482126956, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.870044469833374, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.859139084815979, + "num_tokens": 268155315.0, + "step": 7029 + }, + { + "epoch": 0.8942882584912861, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9596045017242432, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8569942116737366, + "num_tokens": 268199312.0, + "step": 7030 + }, + { + "epoch": 0.8944154687698767, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.791785717010498, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8662745952606201, + "num_tokens": 268236955.0, + "step": 7031 + }, + { + "epoch": 0.8945426790484671, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8684799671173096, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.857461154460907, + "num_tokens": 268272183.0, + "step": 7032 + }, + { + "epoch": 0.8946698893270576, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9272608757019043, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8513494729995728, + "num_tokens": 268305407.0, + "step": 7033 + }, + { + "epoch": 0.8947970996056481, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.1286745071411133, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8715826869010925, + "num_tokens": 268340443.0, + "step": 7034 + }, + { + "epoch": 0.8949243098842387, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9331955909729004, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8796037435531616, + "num_tokens": 268375597.0, + "step": 7035 + }, + { + "epoch": 0.8950515201628292, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9178357124328613, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8547358512878418, + "num_tokens": 268412872.0, + "step": 7036 + }, + { + "epoch": 0.8951787304414197, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8082923889160156, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8609321117401123, + "num_tokens": 268452021.0, + "step": 7037 + }, + { + "epoch": 0.8953059407200101, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9248034954071045, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.868866503238678, + "num_tokens": 268486354.0, + "step": 7038 + }, + { + "epoch": 0.8954331509986007, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.041010856628418, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8561296463012695, + "num_tokens": 268519918.0, + "step": 7039 + }, + { + "epoch": 0.8955603612771912, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9708973169326782, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8530137538909912, + "num_tokens": 268554259.0, + "step": 7040 + }, + { + "epoch": 0.8956875715557817, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9463988542556763, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8539962768554688, + "num_tokens": 268595785.0, + "step": 7041 + }, + { + "epoch": 0.8958147818343722, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9147599935531616, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8447959423065186, + "num_tokens": 268640805.0, + "step": 7042 + }, + { + "epoch": 0.8959419921129628, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9074350595474243, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.865740180015564, + "num_tokens": 268682446.0, + "step": 7043 + }, + { + "epoch": 0.8960692023915532, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8229178190231323, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.869127631187439, + "num_tokens": 268720262.0, + "step": 7044 + }, + { + "epoch": 0.8961964126701437, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9112310409545898, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8479770421981812, + "num_tokens": 268760519.0, + "step": 7045 + }, + { + "epoch": 0.8963236229487342, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.910055160522461, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8541558980941772, + "num_tokens": 268796621.0, + "step": 7046 + }, + { + "epoch": 0.8964508332273248, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9648512601852417, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8692995309829712, + "num_tokens": 268830527.0, + "step": 7047 + }, + { + "epoch": 0.8965780435059153, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8985989093780518, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8663270473480225, + "num_tokens": 268862112.0, + "step": 7048 + }, + { + "epoch": 0.8967052537845058, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 16.603654861450195, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8587583303451538, + "num_tokens": 268901089.0, + "step": 7049 + }, + { + "epoch": 0.8968324640630962, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8584550619125366, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8621457815170288, + "num_tokens": 268942766.0, + "step": 7050 + }, + { + "epoch": 0.8969596743416868, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.2086923122406006, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.865459680557251, + "num_tokens": 268978642.0, + "step": 7051 + }, + { + "epoch": 0.8970868846202773, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.0089097023010254, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8667346239089966, + "num_tokens": 269016048.0, + "step": 7052 + }, + { + "epoch": 0.8972140948988678, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8749337196350098, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8608176112174988, + "num_tokens": 269054381.0, + "step": 7053 + }, + { + "epoch": 0.8973413051774584, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7512073516845703, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8642809391021729, + "num_tokens": 269090672.0, + "step": 7054 + }, + { + "epoch": 0.8974685154560489, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8643518686294556, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.864500880241394, + "num_tokens": 269130231.0, + "step": 7055 + }, + { + "epoch": 0.8975957257346393, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9650861024856567, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8651976585388184, + "num_tokens": 269166282.0, + "step": 7056 + }, + { + "epoch": 0.8977229360132298, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8975626230239868, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8648906946182251, + "num_tokens": 269205326.0, + "step": 7057 + }, + { + "epoch": 0.8978501462918204, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7899425029754639, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8624230027198792, + "num_tokens": 269243903.0, + "step": 7058 + }, + { + "epoch": 0.8979773565704109, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7717509269714355, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8548936247825623, + "num_tokens": 269288330.0, + "step": 7059 + }, + { + "epoch": 0.8981045668490014, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8087438344955444, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8511675596237183, + "num_tokens": 269328731.0, + "step": 7060 + }, + { + "epoch": 0.898231777127592, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7975739240646362, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8624813556671143, + "num_tokens": 269369576.0, + "step": 7061 + }, + { + "epoch": 0.8983589874061825, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8710167407989502, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.859626054763794, + "num_tokens": 269407734.0, + "step": 7062 + }, + { + "epoch": 0.8984861976847729, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9764693975448608, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8403728008270264, + "num_tokens": 269444042.0, + "step": 7063 + }, + { + "epoch": 0.8986134079633634, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.94465970993042, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8850758075714111, + "num_tokens": 269480979.0, + "step": 7064 + }, + { + "epoch": 0.898740618241954, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9654680490493774, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8620197772979736, + "num_tokens": 269515594.0, + "step": 7065 + }, + { + "epoch": 0.8988678285205445, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7432456016540527, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8633441925048828, + "num_tokens": 269556974.0, + "step": 7066 + }, + { + "epoch": 0.898995038799135, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9241529703140259, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8559964299201965, + "num_tokens": 269591088.0, + "step": 7067 + }, + { + "epoch": 0.8991222490777255, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.021357297897339, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8698934316635132, + "num_tokens": 269627921.0, + "step": 7068 + }, + { + "epoch": 0.899249459356316, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9670785665512085, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8456772565841675, + "num_tokens": 269666586.0, + "step": 7069 + }, + { + "epoch": 0.8993766696349065, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.1709036827087402, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8629688620567322, + "num_tokens": 269701205.0, + "step": 7070 + }, + { + "epoch": 0.899503879913497, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 16.980594635009766, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.874550461769104, + "num_tokens": 269739982.0, + "step": 7071 + }, + { + "epoch": 0.8996310901920875, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.1690216064453125, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8621500134468079, + "num_tokens": 269773048.0, + "step": 7072 + }, + { + "epoch": 0.8997583004706781, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9894646406173706, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8608880043029785, + "num_tokens": 269811343.0, + "step": 7073 + }, + { + "epoch": 0.8998855107492686, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9299802780151367, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.866153359413147, + "num_tokens": 269851601.0, + "step": 7074 + }, + { + "epoch": 0.900012721027859, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7311495542526245, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8699625134468079, + "num_tokens": 269891396.0, + "step": 7075 + }, + { + "epoch": 0.9001399313064495, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.997104287147522, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8610963821411133, + "num_tokens": 269930292.0, + "step": 7076 + }, + { + "epoch": 0.9002671415850401, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.146510362625122, + "learning_rate": 1e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8337844014167786, + "num_tokens": 269968464.0, + "step": 7077 + }, + { + "epoch": 0.9003943518636306, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.750910758972168, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.84743332862854, + "num_tokens": 270011569.0, + "step": 7078 + }, + { + "epoch": 0.9005215621422211, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7783621549606323, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8576587438583374, + "num_tokens": 270050608.0, + "step": 7079 + }, + { + "epoch": 0.9006487724208116, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9144805669784546, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8468928337097168, + "num_tokens": 270096100.0, + "step": 7080 + }, + { + "epoch": 0.9007759826994021, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.675245761871338, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8725079298019409, + "num_tokens": 270140915.0, + "step": 7081 + }, + { + "epoch": 0.9009031929779926, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7452706098556519, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8614447116851807, + "num_tokens": 270184233.0, + "step": 7082 + }, + { + "epoch": 0.9010304032565831, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7699253559112549, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8595289587974548, + "num_tokens": 270223889.0, + "step": 7083 + }, + { + "epoch": 0.9011576135351737, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7847899198532104, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8661953210830688, + "num_tokens": 270266357.0, + "step": 7084 + }, + { + "epoch": 0.9012848238137642, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.986728310585022, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.869525134563446, + "num_tokens": 270298252.0, + "step": 7085 + }, + { + "epoch": 0.9014120340923547, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7521106004714966, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8539408445358276, + "num_tokens": 270343187.0, + "step": 7086 + }, + { + "epoch": 0.9015392443709451, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 16.628864288330078, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8498072028160095, + "num_tokens": 270385094.0, + "step": 7087 + }, + { + "epoch": 0.9016664546495357, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.1801254749298096, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.849688708782196, + "num_tokens": 270421772.0, + "step": 7088 + }, + { + "epoch": 0.9017936649281262, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9170596599578857, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.84332275390625, + "num_tokens": 270465033.0, + "step": 7089 + }, + { + "epoch": 0.9019208752067167, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8964329957962036, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8834044933319092, + "num_tokens": 270501370.0, + "step": 7090 + }, + { + "epoch": 0.9020480854853072, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7198832035064697, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8657000660896301, + "num_tokens": 270543571.0, + "step": 7091 + }, + { + "epoch": 0.9021752957638978, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.74778413772583, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8710206747055054, + "num_tokens": 270580635.0, + "step": 7092 + }, + { + "epoch": 0.9023025060424882, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.0977416038513184, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8740313053131104, + "num_tokens": 270616534.0, + "step": 7093 + }, + { + "epoch": 0.9024297163210787, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.975036382675171, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8421157002449036, + "num_tokens": 270653998.0, + "step": 7094 + }, + { + "epoch": 0.9025569265996692, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9706295728683472, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8612156510353088, + "num_tokens": 270688023.0, + "step": 7095 + }, + { + "epoch": 0.9026841368782598, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.1516826152801514, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8435882329940796, + "num_tokens": 270721298.0, + "step": 7096 + }, + { + "epoch": 0.9028113471568503, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9983328580856323, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8665424585342407, + "num_tokens": 270762182.0, + "step": 7097 + }, + { + "epoch": 0.9029385574354408, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8669618368148804, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8756965398788452, + "num_tokens": 270799577.0, + "step": 7098 + }, + { + "epoch": 0.9030657677140312, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7448197603225708, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8626903891563416, + "num_tokens": 270840218.0, + "step": 7099 + }, + { + "epoch": 0.9031929779926218, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9022150039672852, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8811535835266113, + "num_tokens": 270872982.0, + "step": 7100 + }, + { + "epoch": 0.9033201882712123, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9098942279815674, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8661707639694214, + "num_tokens": 270908381.0, + "step": 7101 + }, + { + "epoch": 0.9034473985498028, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.6985406875610352, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8603318929672241, + "num_tokens": 270949217.0, + "step": 7102 + }, + { + "epoch": 0.9035746088283934, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8373719453811646, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8441522121429443, + "num_tokens": 270989667.0, + "step": 7103 + }, + { + "epoch": 0.9037018191069839, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8618338108062744, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8513010144233704, + "num_tokens": 271030395.0, + "step": 7104 + }, + { + "epoch": 0.9038290293855743, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.011016845703125, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.844218373298645, + "num_tokens": 271068239.0, + "step": 7105 + }, + { + "epoch": 0.9039562396641648, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.976617455482483, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.854402482509613, + "num_tokens": 271110294.0, + "step": 7106 + }, + { + "epoch": 0.9040834499427554, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9422568082809448, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8786132335662842, + "num_tokens": 271143288.0, + "step": 7107 + }, + { + "epoch": 0.9042106602213459, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8479197025299072, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8678861856460571, + "num_tokens": 271181183.0, + "step": 7108 + }, + { + "epoch": 0.9043378704999364, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8398025035858154, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8595605492591858, + "num_tokens": 271216201.0, + "step": 7109 + }, + { + "epoch": 0.9044650807785269, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7463334798812866, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8671886920928955, + "num_tokens": 271250158.0, + "step": 7110 + }, + { + "epoch": 0.9045922910571175, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.716448426246643, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8734333515167236, + "num_tokens": 271289994.0, + "step": 7111 + }, + { + "epoch": 0.9047195013357079, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8317116498947144, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8669986724853516, + "num_tokens": 271325782.0, + "step": 7112 + }, + { + "epoch": 0.9048467116142984, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9701566696166992, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8481482267379761, + "num_tokens": 271361604.0, + "step": 7113 + }, + { + "epoch": 0.9049739218928889, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8399909734725952, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8712127804756165, + "num_tokens": 271398520.0, + "step": 7114 + }, + { + "epoch": 0.9051011321714795, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.863500714302063, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8558552861213684, + "num_tokens": 271445138.0, + "step": 7115 + }, + { + "epoch": 0.90522834245007, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.850484848022461, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8661496639251709, + "num_tokens": 271479147.0, + "step": 7116 + }, + { + "epoch": 0.9053555527286605, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9947843551635742, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8636717796325684, + "num_tokens": 271512477.0, + "step": 7117 + }, + { + "epoch": 0.905482763007251, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.916530966758728, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8490836024284363, + "num_tokens": 271552355.0, + "step": 7118 + }, + { + "epoch": 0.9056099732858415, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.962951898574829, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8580456972122192, + "num_tokens": 271595825.0, + "step": 7119 + }, + { + "epoch": 0.905737183564432, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.115004301071167, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8497663736343384, + "num_tokens": 271627384.0, + "step": 7120 + }, + { + "epoch": 0.9058643938430225, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 16.618762969970703, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8703799247741699, + "num_tokens": 271666711.0, + "step": 7121 + }, + { + "epoch": 0.9059916041216131, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.956670880317688, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8672918081283569, + "num_tokens": 271703145.0, + "step": 7122 + }, + { + "epoch": 0.9061188144002036, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9863497018814087, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8501388430595398, + "num_tokens": 271743131.0, + "step": 7123 + }, + { + "epoch": 0.906246024678794, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.921246886253357, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8658900856971741, + "num_tokens": 271778245.0, + "step": 7124 + }, + { + "epoch": 0.9063732349573845, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7525699138641357, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8526950478553772, + "num_tokens": 271817279.0, + "step": 7125 + }, + { + "epoch": 0.9065004452359751, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.5122547149658203, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.86868816614151, + "num_tokens": 271857136.0, + "step": 7126 + }, + { + "epoch": 0.9066276555145656, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.0493719577789307, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8698405027389526, + "num_tokens": 271891651.0, + "step": 7127 + }, + { + "epoch": 0.9067548657931561, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8836338520050049, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8658866286277771, + "num_tokens": 271925805.0, + "step": 7128 + }, + { + "epoch": 0.9068820760717466, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8142237663269043, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8665732741355896, + "num_tokens": 271964954.0, + "step": 7129 + }, + { + "epoch": 0.9070092863503371, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.2190792560577393, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8609288930892944, + "num_tokens": 272000331.0, + "step": 7130 + }, + { + "epoch": 0.9071364966289276, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8817481994628906, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8674757480621338, + "num_tokens": 272037853.0, + "step": 7131 + }, + { + "epoch": 0.9072637069075181, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.771321415901184, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8722771406173706, + "num_tokens": 272077663.0, + "step": 7132 + }, + { + "epoch": 0.9073909171861086, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7619102001190186, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8564603328704834, + "num_tokens": 272116942.0, + "step": 7133 + }, + { + "epoch": 0.9075181274646992, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9098869562149048, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.854572594165802, + "num_tokens": 272151319.0, + "step": 7134 + }, + { + "epoch": 0.9076453377432897, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8518930673599243, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.848760724067688, + "num_tokens": 272190817.0, + "step": 7135 + }, + { + "epoch": 0.9077725480218801, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8275636434555054, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8566293716430664, + "num_tokens": 272230461.0, + "step": 7136 + }, + { + "epoch": 0.9078997583004706, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8179771900177002, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8564559817314148, + "num_tokens": 272268510.0, + "step": 7137 + }, + { + "epoch": 0.9080269685790612, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7953150272369385, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8611640930175781, + "num_tokens": 272312430.0, + "step": 7138 + }, + { + "epoch": 0.9081541788576517, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.0319371223449707, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.860548734664917, + "num_tokens": 272348437.0, + "step": 7139 + }, + { + "epoch": 0.9082813891362422, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.0437941551208496, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8533622026443481, + "num_tokens": 272378902.0, + "step": 7140 + }, + { + "epoch": 0.9084085994148328, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7901055812835693, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8733936548233032, + "num_tokens": 272421257.0, + "step": 7141 + }, + { + "epoch": 0.9085358096934232, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9080843925476074, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.860392153263092, + "num_tokens": 272459800.0, + "step": 7142 + }, + { + "epoch": 0.9086630199720137, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.750269889831543, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8810645341873169, + "num_tokens": 272501245.0, + "step": 7143 + }, + { + "epoch": 0.9087902302506042, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.0585038661956787, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8632650375366211, + "num_tokens": 272535873.0, + "step": 7144 + }, + { + "epoch": 0.9089174405291948, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8997936248779297, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8599317073822021, + "num_tokens": 272575213.0, + "step": 7145 + }, + { + "epoch": 0.9090446508077853, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8757174015045166, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8644084930419922, + "num_tokens": 272611960.0, + "step": 7146 + }, + { + "epoch": 0.9091718610863758, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8653734922409058, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.861810564994812, + "num_tokens": 272650895.0, + "step": 7147 + }, + { + "epoch": 0.9092990713649662, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7656980752944946, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8623229265213013, + "num_tokens": 272690617.0, + "step": 7148 + }, + { + "epoch": 0.9094262816435568, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9112069606781006, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8530599474906921, + "num_tokens": 272732526.0, + "step": 7149 + }, + { + "epoch": 0.9095534919221473, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8666388988494873, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8596460819244385, + "num_tokens": 272769882.0, + "step": 7150 + }, + { + "epoch": 0.9096807022007378, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8132661581039429, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8643324971199036, + "num_tokens": 272809614.0, + "step": 7151 + }, + { + "epoch": 0.9098079124793284, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.982835292816162, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8542442321777344, + "num_tokens": 272844819.0, + "step": 7152 + }, + { + "epoch": 0.9099351227579189, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.7655549049377441, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8670518398284912, + "num_tokens": 272882798.0, + "step": 7153 + }, + { + "epoch": 0.9100623330365093, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8303296566009521, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8634238839149475, + "num_tokens": 272917472.0, + "step": 7154 + }, + { + "epoch": 0.9101895433150998, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7227181196212769, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8500997424125671, + "num_tokens": 272959947.0, + "step": 7155 + }, + { + "epoch": 0.9103167535936904, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9435029029846191, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8449212312698364, + "num_tokens": 272998194.0, + "step": 7156 + }, + { + "epoch": 0.9104439638722809, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9562981128692627, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8793169260025024, + "num_tokens": 273034688.0, + "step": 7157 + }, + { + "epoch": 0.9105711741508714, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.8393990993499756, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8716335892677307, + "num_tokens": 273071575.0, + "step": 7158 + }, + { + "epoch": 0.9106983844294619, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.0183229446411133, + "learning_rate": 1e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8363566398620605, + "num_tokens": 273106106.0, + "step": 7159 + }, + { + "epoch": 0.9108255947080524, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8340898752212524, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8561842441558838, + "num_tokens": 273142959.0, + "step": 7160 + }, + { + "epoch": 0.9109528049866429, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8660578727722168, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.846021294593811, + "num_tokens": 273182781.0, + "step": 7161 + }, + { + "epoch": 0.9110800152652334, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7420216798782349, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8680707812309265, + "num_tokens": 273225738.0, + "step": 7162 + }, + { + "epoch": 0.9112072255438239, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7523173093795776, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8684288263320923, + "num_tokens": 273269166.0, + "step": 7163 + }, + { + "epoch": 0.9113344358224145, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.891946792602539, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8558346033096313, + "num_tokens": 273309093.0, + "step": 7164 + }, + { + "epoch": 0.911461646101005, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8593467473983765, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8669975996017456, + "num_tokens": 273346857.0, + "step": 7165 + }, + { + "epoch": 0.9115888563795955, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.822038173675537, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.856688380241394, + "num_tokens": 273385720.0, + "step": 7166 + }, + { + "epoch": 0.9117160666581859, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7294678688049316, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8617570996284485, + "num_tokens": 273431079.0, + "step": 7167 + }, + { + "epoch": 0.9118432769367765, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7954368591308594, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8623225688934326, + "num_tokens": 273477316.0, + "step": 7168 + }, + { + "epoch": 0.911970487215367, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.0836591720581055, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8654146194458008, + "num_tokens": 273512831.0, + "step": 7169 + }, + { + "epoch": 0.9120976974939575, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9742015600204468, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8557142019271851, + "num_tokens": 273551927.0, + "step": 7170 + }, + { + "epoch": 0.912224907772548, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.9805421829223633, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8661333918571472, + "num_tokens": 273583053.0, + "step": 7171 + }, + { + "epoch": 0.9123521180511386, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.562546491622925, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8513014912605286, + "num_tokens": 273617953.0, + "step": 7172 + }, + { + "epoch": 0.912479328329729, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9976401329040527, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8471960425376892, + "num_tokens": 273649959.0, + "step": 7173 + }, + { + "epoch": 0.9126065386083195, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.0742805004119873, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8514642715454102, + "num_tokens": 273682666.0, + "step": 7174 + }, + { + "epoch": 0.9127337488869101, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.2303152084350586, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8803101778030396, + "num_tokens": 273718641.0, + "step": 7175 + }, + { + "epoch": 0.9128609591655006, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7812319993972778, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8712864518165588, + "num_tokens": 273760502.0, + "step": 7176 + }, + { + "epoch": 0.9129881694440911, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8947995901107788, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8557776212692261, + "num_tokens": 273795068.0, + "step": 7177 + }, + { + "epoch": 0.9131153797226816, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.1270461082458496, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8440112471580505, + "num_tokens": 273825099.0, + "step": 7178 + }, + { + "epoch": 0.9132425900012721, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8535678386688232, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8666845560073853, + "num_tokens": 273862566.0, + "step": 7179 + }, + { + "epoch": 0.9133698002798626, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.030345916748047, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8616793155670166, + "num_tokens": 273896107.0, + "step": 7180 + }, + { + "epoch": 0.9134970105584531, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9806288480758667, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8652685880661011, + "num_tokens": 273934066.0, + "step": 7181 + }, + { + "epoch": 0.9136242208370436, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.965147852897644, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8606942296028137, + "num_tokens": 273971843.0, + "step": 7182 + }, + { + "epoch": 0.9137514311156342, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.1885414123535156, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8529097437858582, + "num_tokens": 274001646.0, + "step": 7183 + }, + { + "epoch": 0.9138786413942247, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9173036813735962, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8465614914894104, + "num_tokens": 274042656.0, + "step": 7184 + }, + { + "epoch": 0.9140058516728151, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8105813264846802, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8475200533866882, + "num_tokens": 274084609.0, + "step": 7185 + }, + { + "epoch": 0.9141330619514056, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.820296049118042, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8648147583007812, + "num_tokens": 274127503.0, + "step": 7186 + }, + { + "epoch": 0.9142602722299962, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9451371431350708, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8657238483428955, + "num_tokens": 274164126.0, + "step": 7187 + }, + { + "epoch": 0.9143874825085867, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.170894145965576, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8614025115966797, + "num_tokens": 274198104.0, + "step": 7188 + }, + { + "epoch": 0.9145146927871772, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9195810556411743, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8492462038993835, + "num_tokens": 274233487.0, + "step": 7189 + }, + { + "epoch": 0.9146419030657678, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.0433108806610107, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8649042248725891, + "num_tokens": 274267317.0, + "step": 7190 + }, + { + "epoch": 0.9147691133443582, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8894392251968384, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.853622555732727, + "num_tokens": 274304169.0, + "step": 7191 + }, + { + "epoch": 0.9148963236229487, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8678339719772339, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8612339496612549, + "num_tokens": 274343265.0, + "step": 7192 + }, + { + "epoch": 0.9150235339015392, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9410761594772339, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8449640870094299, + "num_tokens": 274380756.0, + "step": 7193 + }, + { + "epoch": 0.9151507441801298, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8081474304199219, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8709405660629272, + "num_tokens": 274422035.0, + "step": 7194 + }, + { + "epoch": 0.9152779544587203, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9421722888946533, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.851335883140564, + "num_tokens": 274460875.0, + "step": 7195 + }, + { + "epoch": 0.9154051647373108, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.868170738220215, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8634469509124756, + "num_tokens": 274495807.0, + "step": 7196 + }, + { + "epoch": 0.9155323750159012, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.5856001377105713, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8703023791313171, + "num_tokens": 274533098.0, + "step": 7197 + }, + { + "epoch": 0.9156595852944918, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.281391143798828, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8554106950759888, + "num_tokens": 274565282.0, + "step": 7198 + }, + { + "epoch": 0.9157867955730823, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.852744221687317, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8641388416290283, + "num_tokens": 274605401.0, + "step": 7199 + }, + { + "epoch": 0.9159140058516728, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.1394944190979004, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.855183482170105, + "num_tokens": 274645018.0, + "step": 7200 + }, + { + "epoch": 0.9160412161302633, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8490381240844727, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.868935763835907, + "num_tokens": 274681541.0, + "step": 7201 + }, + { + "epoch": 0.9161684264088539, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9532968997955322, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8641366958618164, + "num_tokens": 274716883.0, + "step": 7202 + }, + { + "epoch": 0.9162956366874443, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8898165225982666, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8558786511421204, + "num_tokens": 274753038.0, + "step": 7203 + }, + { + "epoch": 0.9164228469660348, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.971191644668579, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8675975203514099, + "num_tokens": 274790760.0, + "step": 7204 + }, + { + "epoch": 0.9165500572446253, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8566563129425049, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8541632890701294, + "num_tokens": 274830872.0, + "step": 7205 + }, + { + "epoch": 0.9166772675232159, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8007698059082031, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8485966324806213, + "num_tokens": 274869832.0, + "step": 7206 + }, + { + "epoch": 0.9168044778018064, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.2182657718658447, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8528033494949341, + "num_tokens": 274903853.0, + "step": 7207 + }, + { + "epoch": 0.9169316880803969, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8479081392288208, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8590170741081238, + "num_tokens": 274942911.0, + "step": 7208 + }, + { + "epoch": 0.9170588983589874, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7646650075912476, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8679855465888977, + "num_tokens": 274986427.0, + "step": 7209 + }, + { + "epoch": 0.9171861086375779, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9318681955337524, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8634023666381836, + "num_tokens": 275021557.0, + "step": 7210 + }, + { + "epoch": 0.9173133189161684, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8781386613845825, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8502563238143921, + "num_tokens": 275061308.0, + "step": 7211 + }, + { + "epoch": 0.9174405291947589, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8467445373535156, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8547836542129517, + "num_tokens": 275101497.0, + "step": 7212 + }, + { + "epoch": 0.9175677394733495, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.020418882369995, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8500565886497498, + "num_tokens": 275138317.0, + "step": 7213 + }, + { + "epoch": 0.91769494975194, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7952181100845337, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.860180139541626, + "num_tokens": 275180119.0, + "step": 7214 + }, + { + "epoch": 0.9178221600305305, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.035520553588867, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8589447736740112, + "num_tokens": 275212083.0, + "step": 7215 + }, + { + "epoch": 0.9179493703091209, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.118864059448242, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.850124716758728, + "num_tokens": 275247872.0, + "step": 7216 + }, + { + "epoch": 0.9180765805877115, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.0187532901763916, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.864841103553772, + "num_tokens": 275287022.0, + "step": 7217 + }, + { + "epoch": 0.918203790866302, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.018758535385132, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8632696270942688, + "num_tokens": 275322259.0, + "step": 7218 + }, + { + "epoch": 0.9183310011448925, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8311891555786133, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8737507462501526, + "num_tokens": 275357410.0, + "step": 7219 + }, + { + "epoch": 0.918458211423483, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9962701797485352, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8724817037582397, + "num_tokens": 275388720.0, + "step": 7220 + }, + { + "epoch": 0.9185854217020736, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8811269998550415, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.866816520690918, + "num_tokens": 275427699.0, + "step": 7221 + }, + { + "epoch": 0.918712631980664, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7554272413253784, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8644462823867798, + "num_tokens": 275472220.0, + "step": 7222 + }, + { + "epoch": 0.9188398422592545, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.92622709274292, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8508015871047974, + "num_tokens": 275506693.0, + "step": 7223 + }, + { + "epoch": 0.918967052537845, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.820683479309082, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8648281097412109, + "num_tokens": 275547295.0, + "step": 7224 + }, + { + "epoch": 0.9190942628164356, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9963370561599731, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8580185174942017, + "num_tokens": 275591552.0, + "step": 7225 + }, + { + "epoch": 0.9192214730950261, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8378499746322632, + "learning_rate": 1e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8351200819015503, + "num_tokens": 275632197.0, + "step": 7226 + }, + { + "epoch": 0.9193486833736166, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9065899848937988, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.853885293006897, + "num_tokens": 275670680.0, + "step": 7227 + }, + { + "epoch": 0.919475893652207, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9062435626983643, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8585307002067566, + "num_tokens": 275709315.0, + "step": 7228 + }, + { + "epoch": 0.9196031039307976, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.123342514038086, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8581329584121704, + "num_tokens": 275740405.0, + "step": 7229 + }, + { + "epoch": 0.9197303142093881, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7528420686721802, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8780772686004639, + "num_tokens": 275780915.0, + "step": 7230 + }, + { + "epoch": 0.9198575244879786, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8738949298858643, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8554511666297913, + "num_tokens": 275817832.0, + "step": 7231 + }, + { + "epoch": 0.9199847347665692, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7760100364685059, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8640005588531494, + "num_tokens": 275857621.0, + "step": 7232 + }, + { + "epoch": 0.9201119450451597, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8650226593017578, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8570868968963623, + "num_tokens": 275898917.0, + "step": 7233 + }, + { + "epoch": 0.9202391553237501, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.053313732147217, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8647416234016418, + "num_tokens": 275930151.0, + "step": 7234 + }, + { + "epoch": 0.9203663656023406, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.856915831565857, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8673626184463501, + "num_tokens": 275966901.0, + "step": 7235 + }, + { + "epoch": 0.9204935758809312, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.766913652420044, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8828461170196533, + "num_tokens": 276002845.0, + "step": 7236 + }, + { + "epoch": 0.9206207861595217, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9497524499893188, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8585293292999268, + "num_tokens": 276038476.0, + "step": 7237 + }, + { + "epoch": 0.9207479964381122, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8094449043273926, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8558545112609863, + "num_tokens": 276077867.0, + "step": 7238 + }, + { + "epoch": 0.9208752067167028, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8954731225967407, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8511655926704407, + "num_tokens": 276123289.0, + "step": 7239 + }, + { + "epoch": 0.9210024169952932, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.0660111904144287, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.848680853843689, + "num_tokens": 276157357.0, + "step": 7240 + }, + { + "epoch": 0.9211296272738837, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7615028619766235, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8819063305854797, + "num_tokens": 276195548.0, + "step": 7241 + }, + { + "epoch": 0.9212568375524742, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8229268789291382, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8627796173095703, + "num_tokens": 276233533.0, + "step": 7242 + }, + { + "epoch": 0.9213840478310648, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 3.9947400093078613, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8635045289993286, + "num_tokens": 276274310.0, + "step": 7243 + }, + { + "epoch": 0.9215112581096553, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.0321335792541504, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8593717813491821, + "num_tokens": 276310272.0, + "step": 7244 + }, + { + "epoch": 0.9216384683882458, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.031597137451172, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8547104597091675, + "num_tokens": 276350334.0, + "step": 7245 + }, + { + "epoch": 0.9217656786668362, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.5608930587768555, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8564106225967407, + "num_tokens": 276393913.0, + "step": 7246 + }, + { + "epoch": 0.9218928889454268, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8770970106124878, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8562349081039429, + "num_tokens": 276430368.0, + "step": 7247 + }, + { + "epoch": 0.9220200992240173, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7496789693832397, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8547030687332153, + "num_tokens": 276470930.0, + "step": 7248 + }, + { + "epoch": 0.9221473095026078, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9298570156097412, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8650691509246826, + "num_tokens": 276509359.0, + "step": 7249 + }, + { + "epoch": 0.9222745197811983, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.79965078830719, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8519002199172974, + "num_tokens": 276546980.0, + "step": 7250 + }, + { + "epoch": 0.9224017300597889, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9425902366638184, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8476974964141846, + "num_tokens": 276586212.0, + "step": 7251 + }, + { + "epoch": 0.9225289403383793, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7986379861831665, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8613561987876892, + "num_tokens": 276623946.0, + "step": 7252 + }, + { + "epoch": 0.9226561506169698, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8735122680664062, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8560175895690918, + "num_tokens": 276667567.0, + "step": 7253 + }, + { + "epoch": 0.9227833608955603, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8348771333694458, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8533926010131836, + "num_tokens": 276708479.0, + "step": 7254 + }, + { + "epoch": 0.9229105711741509, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8693456649780273, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8553356528282166, + "num_tokens": 276744627.0, + "step": 7255 + }, + { + "epoch": 0.9230377814527414, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7545539140701294, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8606675863265991, + "num_tokens": 276782830.0, + "step": 7256 + }, + { + "epoch": 0.9231649917313319, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8546779155731201, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8588199615478516, + "num_tokens": 276825745.0, + "step": 7257 + }, + { + "epoch": 0.9232922020099223, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.821333408355713, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8657644987106323, + "num_tokens": 276866743.0, + "step": 7258 + }, + { + "epoch": 0.9234194122885129, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.813205361366272, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8564149141311646, + "num_tokens": 276903884.0, + "step": 7259 + }, + { + "epoch": 0.9235466225671034, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9747881889343262, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.866456925868988, + "num_tokens": 276939142.0, + "step": 7260 + }, + { + "epoch": 0.9236738328456939, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9589438438415527, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8607133626937866, + "num_tokens": 276979041.0, + "step": 7261 + }, + { + "epoch": 0.9238010431242845, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8819236755371094, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8734666109085083, + "num_tokens": 277015590.0, + "step": 7262 + }, + { + "epoch": 0.923928253402875, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.152345657348633, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8532317876815796, + "num_tokens": 277055745.0, + "step": 7263 + }, + { + "epoch": 0.9240554636814655, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8902901411056519, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8551315069198608, + "num_tokens": 277098856.0, + "step": 7264 + }, + { + "epoch": 0.9241826739600559, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9118947982788086, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8715757131576538, + "num_tokens": 277139165.0, + "step": 7265 + }, + { + "epoch": 0.9243098842386465, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9325748682022095, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8644864559173584, + "num_tokens": 277171879.0, + "step": 7266 + }, + { + "epoch": 0.924437094517237, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.810203194618225, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8610094785690308, + "num_tokens": 277211719.0, + "step": 7267 + }, + { + "epoch": 0.9245643047958275, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8528841733932495, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.851353645324707, + "num_tokens": 277252675.0, + "step": 7268 + }, + { + "epoch": 0.924691515074418, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9701685905456543, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.847667932510376, + "num_tokens": 277287529.0, + "step": 7269 + }, + { + "epoch": 0.9248187253530086, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9227133989334106, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8593941926956177, + "num_tokens": 277328392.0, + "step": 7270 + }, + { + "epoch": 0.924945935631599, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8841793537139893, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8699977993965149, + "num_tokens": 277368412.0, + "step": 7271 + }, + { + "epoch": 0.9250731459101895, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8801406621932983, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8657600283622742, + "num_tokens": 277404373.0, + "step": 7272 + }, + { + "epoch": 0.92520035618878, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8838603496551514, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8567484617233276, + "num_tokens": 277444988.0, + "step": 7273 + }, + { + "epoch": 0.9253275664673706, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.049774169921875, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8689237833023071, + "num_tokens": 277487501.0, + "step": 7274 + }, + { + "epoch": 0.9254547767459611, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.325899600982666, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8569144010543823, + "num_tokens": 277529056.0, + "step": 7275 + }, + { + "epoch": 0.9255819870245516, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9257631301879883, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8725888133049011, + "num_tokens": 277563022.0, + "step": 7276 + }, + { + "epoch": 0.925709197303142, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7581431865692139, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8669231534004211, + "num_tokens": 277603879.0, + "step": 7277 + }, + { + "epoch": 0.9258364075817326, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.0006556510925293, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8557214736938477, + "num_tokens": 277643711.0, + "step": 7278 + }, + { + "epoch": 0.9259636178603231, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.94144606590271, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.86024010181427, + "num_tokens": 277680667.0, + "step": 7279 + }, + { + "epoch": 0.9260908281389136, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.0535221099853516, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8575774431228638, + "num_tokens": 277715548.0, + "step": 7280 + }, + { + "epoch": 0.9262180384175042, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.2506930828094482, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8602453470230103, + "num_tokens": 277745287.0, + "step": 7281 + }, + { + "epoch": 0.9263452486960947, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8941477537155151, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8553389310836792, + "num_tokens": 277781484.0, + "step": 7282 + }, + { + "epoch": 0.9264724589746851, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 80.52117919921875, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.862440288066864, + "num_tokens": 277818429.0, + "step": 7283 + }, + { + "epoch": 0.9265996692532756, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.5730574131011963, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8689960241317749, + "num_tokens": 277853077.0, + "step": 7284 + }, + { + "epoch": 0.9267268795318662, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0697689056396484, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8777964115142822, + "num_tokens": 277893997.0, + "step": 7285 + }, + { + "epoch": 0.9268540898104567, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8774889707565308, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8500493764877319, + "num_tokens": 277934680.0, + "step": 7286 + }, + { + "epoch": 0.9269813000890472, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.6641783714294434, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.853055477142334, + "num_tokens": 277980370.0, + "step": 7287 + }, + { + "epoch": 0.9271085103676378, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7844818830490112, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8724031448364258, + "num_tokens": 278017257.0, + "step": 7288 + }, + { + "epoch": 0.9272357206462282, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9300594329833984, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.860862672328949, + "num_tokens": 278054835.0, + "step": 7289 + }, + { + "epoch": 0.9273629309248187, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8305163383483887, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8590461015701294, + "num_tokens": 278089510.0, + "step": 7290 + }, + { + "epoch": 0.9274901412034092, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8610018491744995, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8617423176765442, + "num_tokens": 278126179.0, + "step": 7291 + }, + { + "epoch": 0.9276173514819998, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.425614356994629, + "learning_rate": 1e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.8383012413978577, + "num_tokens": 278170837.0, + "step": 7292 + }, + { + "epoch": 0.9277445617605903, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.97711980342865, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8644770383834839, + "num_tokens": 278207410.0, + "step": 7293 + }, + { + "epoch": 0.9278717720391808, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.861836552619934, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8812931776046753, + "num_tokens": 278240495.0, + "step": 7294 + }, + { + "epoch": 0.9279989823177712, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8071973323822021, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8517292737960815, + "num_tokens": 278278449.0, + "step": 7295 + }, + { + "epoch": 0.9281261925963618, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.6450536251068115, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8701672554016113, + "num_tokens": 278321434.0, + "step": 7296 + }, + { + "epoch": 0.9282534028749523, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8567357063293457, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8560926914215088, + "num_tokens": 278363730.0, + "step": 7297 + }, + { + "epoch": 0.9283806131535428, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8432577848434448, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8617240190505981, + "num_tokens": 278395493.0, + "step": 7298 + }, + { + "epoch": 0.9285078234321333, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8550511598587036, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8629112243652344, + "num_tokens": 278433024.0, + "step": 7299 + }, + { + "epoch": 0.9286350337107239, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7593028545379639, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8617699146270752, + "num_tokens": 278471311.0, + "step": 7300 + }, + { + "epoch": 0.9287622439893143, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.278745174407959, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8795865178108215, + "num_tokens": 278509151.0, + "step": 7301 + }, + { + "epoch": 0.9288894542679048, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7854770421981812, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8680257201194763, + "num_tokens": 278548743.0, + "step": 7302 + }, + { + "epoch": 0.9290166645464953, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8106625080108643, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8719868659973145, + "num_tokens": 278590722.0, + "step": 7303 + }, + { + "epoch": 0.9291438748250859, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.1508913040161133, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8477263450622559, + "num_tokens": 278628150.0, + "step": 7304 + }, + { + "epoch": 0.9292710851036764, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.865331768989563, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8450995683670044, + "num_tokens": 278672543.0, + "step": 7305 + }, + { + "epoch": 0.9293982953822669, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.3308663368225098, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8691813945770264, + "num_tokens": 278707234.0, + "step": 7306 + }, + { + "epoch": 0.9295255056608573, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.136186361312866, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8617693185806274, + "num_tokens": 278734678.0, + "step": 7307 + }, + { + "epoch": 0.9296527159394479, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.734364628791809, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8695891499519348, + "num_tokens": 278778284.0, + "step": 7308 + }, + { + "epoch": 0.9297799262180384, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.852913498878479, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8604782819747925, + "num_tokens": 278815363.0, + "step": 7309 + }, + { + "epoch": 0.9299071364966289, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8669662475585938, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8489828109741211, + "num_tokens": 278860777.0, + "step": 7310 + }, + { + "epoch": 0.9300343467752195, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9444756507873535, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8740159273147583, + "num_tokens": 278890552.0, + "step": 7311 + }, + { + "epoch": 0.93016155705381, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.097027540206909, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8493517637252808, + "num_tokens": 278924741.0, + "step": 7312 + }, + { + "epoch": 0.9302887673324005, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9093849658966064, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8702661991119385, + "num_tokens": 278965185.0, + "step": 7313 + }, + { + "epoch": 0.9304159776109909, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.046055316925049, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8503111004829407, + "num_tokens": 278998961.0, + "step": 7314 + }, + { + "epoch": 0.9305431878895815, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.081057548522949, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8480587005615234, + "num_tokens": 279032689.0, + "step": 7315 + }, + { + "epoch": 0.930670398168172, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7571041584014893, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8480712175369263, + "num_tokens": 279080305.0, + "step": 7316 + }, + { + "epoch": 0.9307976084467625, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.6823166608810425, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.86174476146698, + "num_tokens": 279120581.0, + "step": 7317 + }, + { + "epoch": 0.930924818725353, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7754650115966797, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8731701970100403, + "num_tokens": 279165050.0, + "step": 7318 + }, + { + "epoch": 0.9310520290039436, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7045148611068726, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8650097250938416, + "num_tokens": 279209393.0, + "step": 7319 + }, + { + "epoch": 0.931179239282534, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.1200058460235596, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8484281301498413, + "num_tokens": 279250569.0, + "step": 7320 + }, + { + "epoch": 0.9313064495611245, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.70487380027771, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8671692609786987, + "num_tokens": 279291154.0, + "step": 7321 + }, + { + "epoch": 0.931433659839715, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9513710737228394, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8605769276618958, + "num_tokens": 279326967.0, + "step": 7322 + }, + { + "epoch": 0.9315608701183056, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8957018852233887, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8739213347434998, + "num_tokens": 279367129.0, + "step": 7323 + }, + { + "epoch": 0.9316880803968961, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8095418214797974, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8666795492172241, + "num_tokens": 279403651.0, + "step": 7324 + }, + { + "epoch": 0.9318152906754866, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9955594539642334, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8532233238220215, + "num_tokens": 279439166.0, + "step": 7325 + }, + { + "epoch": 0.931942500954077, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.0839474201202393, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8597666025161743, + "num_tokens": 279473489.0, + "step": 7326 + }, + { + "epoch": 0.9320697112326676, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.1147212982177734, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8647654056549072, + "num_tokens": 279511709.0, + "step": 7327 + }, + { + "epoch": 0.9321969215112581, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9006719589233398, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8486178517341614, + "num_tokens": 279551638.0, + "step": 7328 + }, + { + "epoch": 0.9323241317898486, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.873734712600708, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8725366592407227, + "num_tokens": 279587304.0, + "step": 7329 + }, + { + "epoch": 0.9324513420684392, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.918177604675293, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8513320684432983, + "num_tokens": 279628138.0, + "step": 7330 + }, + { + "epoch": 0.9325785523470297, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9280283451080322, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8648698329925537, + "num_tokens": 279663064.0, + "step": 7331 + }, + { + "epoch": 0.9327057626256201, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9568511247634888, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8525882959365845, + "num_tokens": 279704937.0, + "step": 7332 + }, + { + "epoch": 0.9328329729042106, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.0188138484954834, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8634627461433411, + "num_tokens": 279738215.0, + "step": 7333 + }, + { + "epoch": 0.9329601831828012, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9447232484817505, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8457373380661011, + "num_tokens": 279771996.0, + "step": 7334 + }, + { + "epoch": 0.9330873934613917, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7864118814468384, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8750747442245483, + "num_tokens": 279810037.0, + "step": 7335 + }, + { + "epoch": 0.9332146037399822, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9752026796340942, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8542996644973755, + "num_tokens": 279847394.0, + "step": 7336 + }, + { + "epoch": 0.9333418140185727, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.1853833198547363, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8473021984100342, + "num_tokens": 279879057.0, + "step": 7337 + }, + { + "epoch": 0.9334690242971632, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.3927814960479736, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8545083999633789, + "num_tokens": 279911775.0, + "step": 7338 + }, + { + "epoch": 0.9335962345757537, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.023358106613159, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.862310528755188, + "num_tokens": 279947661.0, + "step": 7339 + }, + { + "epoch": 0.9337234448543442, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.1271018981933594, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8615681529045105, + "num_tokens": 279977507.0, + "step": 7340 + }, + { + "epoch": 0.9338506551329347, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9604578018188477, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8564278483390808, + "num_tokens": 280016749.0, + "step": 7341 + }, + { + "epoch": 0.9339778654115253, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9393173456192017, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8623239398002625, + "num_tokens": 280056092.0, + "step": 7342 + }, + { + "epoch": 0.9341050756901158, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.907294511795044, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8735157251358032, + "num_tokens": 280089803.0, + "step": 7343 + }, + { + "epoch": 0.9342322859687062, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9250279664993286, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8498440980911255, + "num_tokens": 280126190.0, + "step": 7344 + }, + { + "epoch": 0.9343594962472968, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9039329290390015, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8713033199310303, + "num_tokens": 280162743.0, + "step": 7345 + }, + { + "epoch": 0.9344867065258873, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7552047967910767, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8597114682197571, + "num_tokens": 280203676.0, + "step": 7346 + }, + { + "epoch": 0.9346139168044778, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.827057957649231, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8770142793655396, + "num_tokens": 280241765.0, + "step": 7347 + }, + { + "epoch": 0.9347411270830683, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9580336809158325, + "learning_rate": 1e-06, + "loss": 0.5416, + "mean_token_accuracy": 0.8400774598121643, + "num_tokens": 280280103.0, + "step": 7348 + }, + { + "epoch": 0.9348683373616589, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9414950609207153, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8443909287452698, + "num_tokens": 280314674.0, + "step": 7349 + }, + { + "epoch": 0.9349955476402493, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7387363910675049, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8626042008399963, + "num_tokens": 280353890.0, + "step": 7350 + }, + { + "epoch": 0.9351227579188398, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.899322509765625, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8529869318008423, + "num_tokens": 280396403.0, + "step": 7351 + }, + { + "epoch": 0.9352499681974303, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9650126695632935, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8615918159484863, + "num_tokens": 280436154.0, + "step": 7352 + }, + { + "epoch": 0.9353771784760209, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.3711400032043457, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8671579360961914, + "num_tokens": 280476480.0, + "step": 7353 + }, + { + "epoch": 0.9355043887546114, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.095165967941284, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8648011684417725, + "num_tokens": 280505976.0, + "step": 7354 + }, + { + "epoch": 0.9356315990332019, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9048038721084595, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8576643466949463, + "num_tokens": 280548365.0, + "step": 7355 + }, + { + "epoch": 0.9357588093117923, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8323884010314941, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8788707256317139, + "num_tokens": 280588074.0, + "step": 7356 + }, + { + "epoch": 0.9358860195903829, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7712901830673218, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8709914684295654, + "num_tokens": 280626915.0, + "step": 7357 + }, + { + "epoch": 0.9360132298689734, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9534960985183716, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8566753268241882, + "num_tokens": 280660088.0, + "step": 7358 + }, + { + "epoch": 0.9361404401475639, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8647711277008057, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8593987822532654, + "num_tokens": 280701326.0, + "step": 7359 + }, + { + "epoch": 0.9362676504261545, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.871174931526184, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.870389461517334, + "num_tokens": 280740256.0, + "step": 7360 + }, + { + "epoch": 0.936394860704745, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.0619733333587646, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8744204044342041, + "num_tokens": 280773515.0, + "step": 7361 + }, + { + "epoch": 0.9365220709833355, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9643783569335938, + "learning_rate": 1e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8362185955047607, + "num_tokens": 280817207.0, + "step": 7362 + }, + { + "epoch": 0.9366492812619259, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8763221502304077, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8473949432373047, + "num_tokens": 280860253.0, + "step": 7363 + }, + { + "epoch": 0.9367764915405165, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.245522975921631, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8578175902366638, + "num_tokens": 280892634.0, + "step": 7364 + }, + { + "epoch": 0.936903701819107, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8394441604614258, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8697164058685303, + "num_tokens": 280930912.0, + "step": 7365 + }, + { + "epoch": 0.9370309120976975, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8867019414901733, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8639228343963623, + "num_tokens": 280971588.0, + "step": 7366 + }, + { + "epoch": 0.937158122376288, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9863320589065552, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8691295385360718, + "num_tokens": 281009898.0, + "step": 7367 + }, + { + "epoch": 0.9372853326548786, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9673254489898682, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8529099225997925, + "num_tokens": 281046726.0, + "step": 7368 + }, + { + "epoch": 0.937412542933469, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9465441703796387, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8747918605804443, + "num_tokens": 281085163.0, + "step": 7369 + }, + { + "epoch": 0.9375397532120595, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.915582299232483, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8674800395965576, + "num_tokens": 281128660.0, + "step": 7370 + }, + { + "epoch": 0.93766696349065, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9877256155014038, + "learning_rate": 1e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.842556357383728, + "num_tokens": 281163780.0, + "step": 7371 + }, + { + "epoch": 0.9377941737692406, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8851377964019775, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8693642020225525, + "num_tokens": 281204247.0, + "step": 7372 + }, + { + "epoch": 0.9379213840478311, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9159280061721802, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8847255706787109, + "num_tokens": 281239518.0, + "step": 7373 + }, + { + "epoch": 0.9380485943264216, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8318119049072266, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8693283200263977, + "num_tokens": 281275916.0, + "step": 7374 + }, + { + "epoch": 0.938175804605012, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.944311261177063, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8708947896957397, + "num_tokens": 281307615.0, + "step": 7375 + }, + { + "epoch": 0.9383030148836026, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9221278429031372, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8662116527557373, + "num_tokens": 281342677.0, + "step": 7376 + }, + { + "epoch": 0.9384302251621931, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.105320692062378, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.856874942779541, + "num_tokens": 281379545.0, + "step": 7377 + }, + { + "epoch": 0.9385574354407836, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8061866760253906, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.842960000038147, + "num_tokens": 281424169.0, + "step": 7378 + }, + { + "epoch": 0.9386846457193742, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.718442678451538, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8810870051383972, + "num_tokens": 281461306.0, + "step": 7379 + }, + { + "epoch": 0.9388118559979647, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9504598379135132, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8755509853363037, + "num_tokens": 281498753.0, + "step": 7380 + }, + { + "epoch": 0.9389390662765551, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7624400854110718, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8710097074508667, + "num_tokens": 281537163.0, + "step": 7381 + }, + { + "epoch": 0.9390662765551456, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.960906982421875, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.853018045425415, + "num_tokens": 281577926.0, + "step": 7382 + }, + { + "epoch": 0.9391934868337362, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.890038251876831, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8805866837501526, + "num_tokens": 281621283.0, + "step": 7383 + }, + { + "epoch": 0.9393206971123267, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8098556995391846, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8648358583450317, + "num_tokens": 281658336.0, + "step": 7384 + }, + { + "epoch": 0.9394479073909172, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7864062786102295, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8657097220420837, + "num_tokens": 281695137.0, + "step": 7385 + }, + { + "epoch": 0.9395751176695077, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.7069127559661865, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.876372218132019, + "num_tokens": 281733570.0, + "step": 7386 + }, + { + "epoch": 0.9397023279480982, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.8390673398971558, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8633920550346375, + "num_tokens": 281771163.0, + "step": 7387 + }, + { + "epoch": 0.9398295382266887, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.807989478111267, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.863972544670105, + "num_tokens": 281813715.0, + "step": 7388 + }, + { + "epoch": 0.9399567485052792, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.82883882522583, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8590048551559448, + "num_tokens": 281856921.0, + "step": 7389 + }, + { + "epoch": 0.9400839587838697, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.934383511543274, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8522075414657593, + "num_tokens": 281893273.0, + "step": 7390 + }, + { + "epoch": 0.9402111690624603, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9088407754898071, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8661664128303528, + "num_tokens": 281927452.0, + "step": 7391 + }, + { + "epoch": 0.9403383793410508, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.6316399574279785, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8868535757064819, + "num_tokens": 281965359.0, + "step": 7392 + }, + { + "epoch": 0.9404655896196412, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 2.0972959995269775, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8593378067016602, + "num_tokens": 281999209.0, + "step": 7393 + }, + { + "epoch": 0.9405927998982317, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9318641424179077, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8522768020629883, + "num_tokens": 282042129.0, + "step": 7394 + }, + { + "epoch": 0.9407200101768223, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.9131273031234741, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8525189757347107, + "num_tokens": 282087941.0, + "step": 7395 + }, + { + "epoch": 0.9408472204554128, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9546464681625366, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8660937547683716, + "num_tokens": 282123130.0, + "step": 7396 + }, + { + "epoch": 0.9409744307340033, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.0062482357025146, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8583016991615295, + "num_tokens": 282161142.0, + "step": 7397 + }, + { + "epoch": 0.9411016410125939, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7226879596710205, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8839480876922607, + "num_tokens": 282196363.0, + "step": 7398 + }, + { + "epoch": 0.9412288512911843, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.954958200454712, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8421404957771301, + "num_tokens": 282233202.0, + "step": 7399 + }, + { + "epoch": 0.9413560615697748, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8703148365020752, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.862947940826416, + "num_tokens": 282270976.0, + "step": 7400 + }, + { + "epoch": 0.9414832718483653, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8406212329864502, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8722231388092041, + "num_tokens": 282312223.0, + "step": 7401 + }, + { + "epoch": 0.9416104821269559, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8886117935180664, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8452236652374268, + "num_tokens": 282350418.0, + "step": 7402 + }, + { + "epoch": 0.9417376924055464, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.0085861682891846, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8630165457725525, + "num_tokens": 282380751.0, + "step": 7403 + }, + { + "epoch": 0.9418649026841369, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.027522325515747, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8739877939224243, + "num_tokens": 282420248.0, + "step": 7404 + }, + { + "epoch": 0.9419921129627273, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.94180428981781, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8664727210998535, + "num_tokens": 282457951.0, + "step": 7405 + }, + { + "epoch": 0.9421193232413179, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9986586570739746, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8616033792495728, + "num_tokens": 282507379.0, + "step": 7406 + }, + { + "epoch": 0.9422465335199084, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9555506706237793, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8497762680053711, + "num_tokens": 282546905.0, + "step": 7407 + }, + { + "epoch": 0.9423737437984989, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7862011194229126, + "learning_rate": 1e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.8332239389419556, + "num_tokens": 282591368.0, + "step": 7408 + }, + { + "epoch": 0.9425009540770894, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.08389949798584, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8598979711532593, + "num_tokens": 282617331.0, + "step": 7409 + }, + { + "epoch": 0.94262816435568, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.897161602973938, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8748503923416138, + "num_tokens": 282652346.0, + "step": 7410 + }, + { + "epoch": 0.9427553746342705, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7813692092895508, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8663263320922852, + "num_tokens": 282691564.0, + "step": 7411 + }, + { + "epoch": 0.9428825849128609, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.0993354320526123, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8725675344467163, + "num_tokens": 282728186.0, + "step": 7412 + }, + { + "epoch": 0.9430097951914514, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.322195053100586, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8565870523452759, + "num_tokens": 282770491.0, + "step": 7413 + }, + { + "epoch": 0.943137005470042, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.857730746269226, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8554330468177795, + "num_tokens": 282809824.0, + "step": 7414 + }, + { + "epoch": 0.9432642157486325, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8744970560073853, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8673861622810364, + "num_tokens": 282846735.0, + "step": 7415 + }, + { + "epoch": 0.943391426027223, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7992007732391357, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.880868673324585, + "num_tokens": 282882242.0, + "step": 7416 + }, + { + "epoch": 0.9435186363058136, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.954113483428955, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8506113290786743, + "num_tokens": 282918774.0, + "step": 7417 + }, + { + "epoch": 0.943645846584404, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9747813940048218, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8548058271408081, + "num_tokens": 282957766.0, + "step": 7418 + }, + { + "epoch": 0.9437730568629945, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.982219934463501, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8499714136123657, + "num_tokens": 282995980.0, + "step": 7419 + }, + { + "epoch": 0.943900267141585, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8016295433044434, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8623284697532654, + "num_tokens": 283033148.0, + "step": 7420 + }, + { + "epoch": 0.9440274774201756, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.761942982673645, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8666857481002808, + "num_tokens": 283069781.0, + "step": 7421 + }, + { + "epoch": 0.9441546876987661, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.0724523067474365, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8549323081970215, + "num_tokens": 283107270.0, + "step": 7422 + }, + { + "epoch": 0.9442818979773566, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8782687187194824, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8738144040107727, + "num_tokens": 283143571.0, + "step": 7423 + }, + { + "epoch": 0.944409108255947, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8613786697387695, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8666536808013916, + "num_tokens": 283179995.0, + "step": 7424 + }, + { + "epoch": 0.9445363185345376, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9110509157180786, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8613921403884888, + "num_tokens": 283218370.0, + "step": 7425 + }, + { + "epoch": 0.9446635288131281, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.3158981800079346, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.855475902557373, + "num_tokens": 283247879.0, + "step": 7426 + }, + { + "epoch": 0.9447907390917186, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8482085466384888, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8769727945327759, + "num_tokens": 283285027.0, + "step": 7427 + }, + { + "epoch": 0.9449179493703092, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8558228015899658, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8655693531036377, + "num_tokens": 283324934.0, + "step": 7428 + }, + { + "epoch": 0.9450451596488997, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.892380714416504, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8693289160728455, + "num_tokens": 283357540.0, + "step": 7429 + }, + { + "epoch": 0.9451723699274901, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9386729001998901, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8610256910324097, + "num_tokens": 283393470.0, + "step": 7430 + }, + { + "epoch": 0.9452995802060806, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.882237195968628, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8620771169662476, + "num_tokens": 283429916.0, + "step": 7431 + }, + { + "epoch": 0.9454267904846712, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9147263765335083, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8497284054756165, + "num_tokens": 283467997.0, + "step": 7432 + }, + { + "epoch": 0.9455540007632617, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.0854015350341797, + "learning_rate": 1e-06, + "loss": 0.5437, + "mean_token_accuracy": 0.8271380066871643, + "num_tokens": 283504741.0, + "step": 7433 + }, + { + "epoch": 0.9456812110418522, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.090458393096924, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8712239265441895, + "num_tokens": 283534535.0, + "step": 7434 + }, + { + "epoch": 0.9458084213204427, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.034226894378662, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8730365037918091, + "num_tokens": 283571600.0, + "step": 7435 + }, + { + "epoch": 0.9459356315990332, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.013814687728882, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8485420942306519, + "num_tokens": 283605662.0, + "step": 7436 + }, + { + "epoch": 0.9460628418776237, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9393997192382812, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8614287376403809, + "num_tokens": 283642868.0, + "step": 7437 + }, + { + "epoch": 0.9461900521562142, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.1094882488250732, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8517582416534424, + "num_tokens": 283675604.0, + "step": 7438 + }, + { + "epoch": 0.9463172624348047, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8592703342437744, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8828045725822449, + "num_tokens": 283709459.0, + "step": 7439 + }, + { + "epoch": 0.9464444727133953, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8051080703735352, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8618485331535339, + "num_tokens": 283748020.0, + "step": 7440 + }, + { + "epoch": 0.9465716829919858, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0924324989318848, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8568978309631348, + "num_tokens": 283778514.0, + "step": 7441 + }, + { + "epoch": 0.9466988932705762, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8224475383758545, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8653203845024109, + "num_tokens": 283819764.0, + "step": 7442 + }, + { + "epoch": 0.9468261035491667, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.0713582038879395, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8477094769477844, + "num_tokens": 283853897.0, + "step": 7443 + }, + { + "epoch": 0.9469533138277573, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 3.0113203525543213, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8761284351348877, + "num_tokens": 283891826.0, + "step": 7444 + }, + { + "epoch": 0.9470805241063478, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.889336347579956, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8605608940124512, + "num_tokens": 283931690.0, + "step": 7445 + }, + { + "epoch": 0.9472077343849383, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8041807413101196, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8618465662002563, + "num_tokens": 283971107.0, + "step": 7446 + }, + { + "epoch": 0.9473349446635289, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9625471830368042, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8681628704071045, + "num_tokens": 284005286.0, + "step": 7447 + }, + { + "epoch": 0.9474621549421193, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7581496238708496, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8758407235145569, + "num_tokens": 284044089.0, + "step": 7448 + }, + { + "epoch": 0.9475893652207098, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.348259449005127, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8568468689918518, + "num_tokens": 284081172.0, + "step": 7449 + }, + { + "epoch": 0.9477165754993003, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.512568712234497, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8692525625228882, + "num_tokens": 284113362.0, + "step": 7450 + }, + { + "epoch": 0.9478437857778909, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.914687991142273, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8709782958030701, + "num_tokens": 284155272.0, + "step": 7451 + }, + { + "epoch": 0.9479709960564814, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.044724464416504, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8693516850471497, + "num_tokens": 284188218.0, + "step": 7452 + }, + { + "epoch": 0.9480982063350719, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9884155988693237, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8650431036949158, + "num_tokens": 284228838.0, + "step": 7453 + }, + { + "epoch": 0.9482254166136623, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.104515314102173, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.858709454536438, + "num_tokens": 284265570.0, + "step": 7454 + }, + { + "epoch": 0.9483526268922529, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9607726335525513, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8521271347999573, + "num_tokens": 284307062.0, + "step": 7455 + }, + { + "epoch": 0.9484798371708434, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8795400857925415, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8616878986358643, + "num_tokens": 284340488.0, + "step": 7456 + }, + { + "epoch": 0.9486070474494339, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8462885618209839, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8605602979660034, + "num_tokens": 284382399.0, + "step": 7457 + }, + { + "epoch": 0.9487342577280244, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9187086820602417, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8486918210983276, + "num_tokens": 284416391.0, + "step": 7458 + }, + { + "epoch": 0.948861468006615, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8179235458374023, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.861854076385498, + "num_tokens": 284457049.0, + "step": 7459 + }, + { + "epoch": 0.9489886782852054, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.007943630218506, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8606085777282715, + "num_tokens": 284494426.0, + "step": 7460 + }, + { + "epoch": 0.9491158885637959, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7893702983856201, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8796334266662598, + "num_tokens": 284533349.0, + "step": 7461 + }, + { + "epoch": 0.9492430988423864, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.0125205516815186, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8686326742172241, + "num_tokens": 284567777.0, + "step": 7462 + }, + { + "epoch": 0.949370309120977, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9113034009933472, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8552489280700684, + "num_tokens": 284609120.0, + "step": 7463 + }, + { + "epoch": 0.9494975193995675, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.882136583328247, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8540213108062744, + "num_tokens": 284649221.0, + "step": 7464 + }, + { + "epoch": 0.949624729678158, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9519128799438477, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8678056001663208, + "num_tokens": 284682595.0, + "step": 7465 + }, + { + "epoch": 0.9497519399567486, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9536057710647583, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8576822876930237, + "num_tokens": 284722903.0, + "step": 7466 + }, + { + "epoch": 0.949879150235339, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8700319528579712, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8674817681312561, + "num_tokens": 284759166.0, + "step": 7467 + }, + { + "epoch": 0.9500063605139295, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.851135015487671, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8581815958023071, + "num_tokens": 284803192.0, + "step": 7468 + }, + { + "epoch": 0.95013357079252, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.877358078956604, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8639547228813171, + "num_tokens": 284837088.0, + "step": 7469 + }, + { + "epoch": 0.9502607810711106, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8458340167999268, + "learning_rate": 1e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8429627418518066, + "num_tokens": 284879641.0, + "step": 7470 + }, + { + "epoch": 0.9503879913497011, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9066585302352905, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8507746458053589, + "num_tokens": 284917519.0, + "step": 7471 + }, + { + "epoch": 0.9505152016282916, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9504857063293457, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8519901633262634, + "num_tokens": 284950422.0, + "step": 7472 + }, + { + "epoch": 0.950642411906882, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8753727674484253, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.868898332118988, + "num_tokens": 284986392.0, + "step": 7473 + }, + { + "epoch": 0.9507696221854726, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.878640055656433, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8635802268981934, + "num_tokens": 285027684.0, + "step": 7474 + }, + { + "epoch": 0.9508968324640631, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.024038553237915, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.853205680847168, + "num_tokens": 285059551.0, + "step": 7475 + }, + { + "epoch": 0.9510240427426536, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8799357414245605, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8627433776855469, + "num_tokens": 285100197.0, + "step": 7476 + }, + { + "epoch": 0.9511512530212441, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.992728352546692, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8625699281692505, + "num_tokens": 285130115.0, + "step": 7477 + }, + { + "epoch": 0.9512784632998347, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.858174443244934, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8624695539474487, + "num_tokens": 285169069.0, + "step": 7478 + }, + { + "epoch": 0.9514056735784251, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8222148418426514, + "learning_rate": 1e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8415759801864624, + "num_tokens": 285209971.0, + "step": 7479 + }, + { + "epoch": 0.9515328838570156, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.874996542930603, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8568553924560547, + "num_tokens": 285250024.0, + "step": 7480 + }, + { + "epoch": 0.9516600941356061, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9820656776428223, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8583076596260071, + "num_tokens": 285284900.0, + "step": 7481 + }, + { + "epoch": 0.9517873044141967, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9159796237945557, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.866605281829834, + "num_tokens": 285325657.0, + "step": 7482 + }, + { + "epoch": 0.9519145146927872, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.1594340801239014, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8610831499099731, + "num_tokens": 285356362.0, + "step": 7483 + }, + { + "epoch": 0.9520417249713777, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.129915475845337, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8613229990005493, + "num_tokens": 285394321.0, + "step": 7484 + }, + { + "epoch": 0.9521689352499682, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9010666608810425, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8809210658073425, + "num_tokens": 285435260.0, + "step": 7485 + }, + { + "epoch": 0.9522961455285587, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.6631793975830078, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8694721460342407, + "num_tokens": 285479449.0, + "step": 7486 + }, + { + "epoch": 0.9524233558071492, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.215977430343628, + "learning_rate": 1e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.8416252732276917, + "num_tokens": 285520492.0, + "step": 7487 + }, + { + "epoch": 0.9525505660857397, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9498313665390015, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8588266372680664, + "num_tokens": 285558410.0, + "step": 7488 + }, + { + "epoch": 0.9526777763643303, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.985067367553711, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8423388004302979, + "num_tokens": 285598871.0, + "step": 7489 + }, + { + "epoch": 0.9528049866429208, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8337376117706299, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.853823184967041, + "num_tokens": 285641374.0, + "step": 7490 + }, + { + "epoch": 0.9529321969215112, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9420530796051025, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.872974157333374, + "num_tokens": 285675416.0, + "step": 7491 + }, + { + "epoch": 0.9530594072001017, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9041469097137451, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8661319017410278, + "num_tokens": 285707491.0, + "step": 7492 + }, + { + "epoch": 0.9531866174786923, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.791965126991272, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8507275581359863, + "num_tokens": 285747571.0, + "step": 7493 + }, + { + "epoch": 0.9533138277572828, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.6405640840530396, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8847811818122864, + "num_tokens": 285786362.0, + "step": 7494 + }, + { + "epoch": 0.9534410380358733, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7400799989700317, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8692609667778015, + "num_tokens": 285829401.0, + "step": 7495 + }, + { + "epoch": 0.9535682483144639, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8545132875442505, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8597327470779419, + "num_tokens": 285867853.0, + "step": 7496 + }, + { + "epoch": 0.9536954585930543, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.02262544631958, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8730883598327637, + "num_tokens": 285908497.0, + "step": 7497 + }, + { + "epoch": 0.9538226688716448, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.064774513244629, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8480237126350403, + "num_tokens": 285940528.0, + "step": 7498 + }, + { + "epoch": 0.9539498791502353, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8182963132858276, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.862242579460144, + "num_tokens": 285980206.0, + "step": 7499 + }, + { + "epoch": 0.9540770894288259, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8790075778961182, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.856232225894928, + "num_tokens": 286019409.0, + "step": 7500 + }, + { + "epoch": 0.9542042997074164, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.999006986618042, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8701518774032593, + "num_tokens": 286050955.0, + "step": 7501 + }, + { + "epoch": 0.9543315099860069, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9874173402786255, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8682980537414551, + "num_tokens": 286087567.0, + "step": 7502 + }, + { + "epoch": 0.9544587202645973, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.1911001205444336, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8414658308029175, + "num_tokens": 286126893.0, + "step": 7503 + }, + { + "epoch": 0.9545859305431879, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.940624475479126, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8570019006729126, + "num_tokens": 286165818.0, + "step": 7504 + }, + { + "epoch": 0.9547131408217784, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9796496629714966, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8639140129089355, + "num_tokens": 286208026.0, + "step": 7505 + }, + { + "epoch": 0.9548403511003689, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8902920484542847, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8607209324836731, + "num_tokens": 286245468.0, + "step": 7506 + }, + { + "epoch": 0.9549675613789594, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.008424758911133, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.865294337272644, + "num_tokens": 286283955.0, + "step": 7507 + }, + { + "epoch": 0.95509477165755, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.7190133333206177, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8670440912246704, + "num_tokens": 286326560.0, + "step": 7508 + }, + { + "epoch": 0.9552219819361404, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9609194993972778, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8624739050865173, + "num_tokens": 286365791.0, + "step": 7509 + }, + { + "epoch": 0.9553491922147309, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8269007205963135, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8713642358779907, + "num_tokens": 286404132.0, + "step": 7510 + }, + { + "epoch": 0.9554764024933214, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7943506240844727, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8524002432823181, + "num_tokens": 286444828.0, + "step": 7511 + }, + { + "epoch": 0.955603612771912, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8009841442108154, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8664738535881042, + "num_tokens": 286481725.0, + "step": 7512 + }, + { + "epoch": 0.9557308230505025, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8104790449142456, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8621734380722046, + "num_tokens": 286520354.0, + "step": 7513 + }, + { + "epoch": 0.955858033329093, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8370015621185303, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8596439361572266, + "num_tokens": 286564314.0, + "step": 7514 + }, + { + "epoch": 0.9559852436076836, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7941370010375977, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8600252270698547, + "num_tokens": 286606955.0, + "step": 7515 + }, + { + "epoch": 0.956112453886274, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.943428635597229, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8678052425384521, + "num_tokens": 286640091.0, + "step": 7516 + }, + { + "epoch": 0.9562396641648645, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.254755735397339, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.861122727394104, + "num_tokens": 286679090.0, + "step": 7517 + }, + { + "epoch": 0.956366874443455, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8389410972595215, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8661051988601685, + "num_tokens": 286720260.0, + "step": 7518 + }, + { + "epoch": 0.9564940847220456, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7620419263839722, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.868039071559906, + "num_tokens": 286763992.0, + "step": 7519 + }, + { + "epoch": 0.9566212950006361, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9037317037582397, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8432631492614746, + "num_tokens": 286807068.0, + "step": 7520 + }, + { + "epoch": 0.9567485052792266, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7300498485565186, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8690958023071289, + "num_tokens": 286853020.0, + "step": 7521 + }, + { + "epoch": 0.956875715557817, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9974889755249023, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8515399694442749, + "num_tokens": 286896098.0, + "step": 7522 + }, + { + "epoch": 0.9570029258364076, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8925443887710571, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8462411165237427, + "num_tokens": 286938873.0, + "step": 7523 + }, + { + "epoch": 0.9571301361149981, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.835934042930603, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8663386106491089, + "num_tokens": 286972941.0, + "step": 7524 + }, + { + "epoch": 0.9572573463935886, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.0424399375915527, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8510466814041138, + "num_tokens": 287010818.0, + "step": 7525 + }, + { + "epoch": 0.9573845566721791, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8801566362380981, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8655454516410828, + "num_tokens": 287053841.0, + "step": 7526 + }, + { + "epoch": 0.9575117669507697, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.897830843925476, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8668546080589294, + "num_tokens": 287090900.0, + "step": 7527 + }, + { + "epoch": 0.9576389772293601, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.856843113899231, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8593717813491821, + "num_tokens": 287131203.0, + "step": 7528 + }, + { + "epoch": 0.9577661875079506, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.942273736000061, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8725590705871582, + "num_tokens": 287165949.0, + "step": 7529 + }, + { + "epoch": 0.9578933977865411, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.116499423980713, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8610480427742004, + "num_tokens": 287203376.0, + "step": 7530 + }, + { + "epoch": 0.9580206080651317, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9929442405700684, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8539314866065979, + "num_tokens": 287240391.0, + "step": 7531 + }, + { + "epoch": 0.9581478183437222, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.86008882522583, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8584760427474976, + "num_tokens": 287278485.0, + "step": 7532 + }, + { + "epoch": 0.9582750286223127, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9436219930648804, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8564020991325378, + "num_tokens": 287313626.0, + "step": 7533 + }, + { + "epoch": 0.9584022389009031, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8881407976150513, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8770357966423035, + "num_tokens": 287344532.0, + "step": 7534 + }, + { + "epoch": 0.9585294491794937, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9337589740753174, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.844027042388916, + "num_tokens": 287383069.0, + "step": 7535 + }, + { + "epoch": 0.9586566594580842, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9443334341049194, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8611039519309998, + "num_tokens": 287418554.0, + "step": 7536 + }, + { + "epoch": 0.9587838697366747, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9569610357284546, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8612003326416016, + "num_tokens": 287452449.0, + "step": 7537 + }, + { + "epoch": 0.9589110800152653, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.949042797088623, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8472422957420349, + "num_tokens": 287492247.0, + "step": 7538 + }, + { + "epoch": 0.9590382902938558, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9511674642562866, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8671824932098389, + "num_tokens": 287528651.0, + "step": 7539 + }, + { + "epoch": 0.9591655005724462, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.8487884998321533, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8518863916397095, + "num_tokens": 287565837.0, + "step": 7540 + }, + { + "epoch": 0.9592927108510367, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9638291597366333, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8613694310188293, + "num_tokens": 287599469.0, + "step": 7541 + }, + { + "epoch": 0.9594199211296273, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.845047116279602, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8604716062545776, + "num_tokens": 287637104.0, + "step": 7542 + }, + { + "epoch": 0.9595471314082178, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.108273506164551, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8618309497833252, + "num_tokens": 287679134.0, + "step": 7543 + }, + { + "epoch": 0.9596743416868083, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7578749656677246, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8687598705291748, + "num_tokens": 287720340.0, + "step": 7544 + }, + { + "epoch": 0.9598015519653988, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7465181350708008, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8660540580749512, + "num_tokens": 287760931.0, + "step": 7545 + }, + { + "epoch": 0.9599287622439893, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8692094087600708, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8476991653442383, + "num_tokens": 287801723.0, + "step": 7546 + }, + { + "epoch": 0.9600559725225798, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.853846549987793, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.859408438205719, + "num_tokens": 287837010.0, + "step": 7547 + }, + { + "epoch": 0.9601831828011703, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0891730785369873, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8629149198532104, + "num_tokens": 287874381.0, + "step": 7548 + }, + { + "epoch": 0.9603103930797608, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0050623416900635, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8550679683685303, + "num_tokens": 287916673.0, + "step": 7549 + }, + { + "epoch": 0.9604376033583514, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8517305850982666, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8571586608886719, + "num_tokens": 287956764.0, + "step": 7550 + }, + { + "epoch": 0.9605648136369419, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.78575599193573, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8698225021362305, + "num_tokens": 287996373.0, + "step": 7551 + }, + { + "epoch": 0.9606920239155323, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.243556022644043, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8565762042999268, + "num_tokens": 288033671.0, + "step": 7552 + }, + { + "epoch": 0.9608192341941229, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0014867782592773, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8702309131622314, + "num_tokens": 288068254.0, + "step": 7553 + }, + { + "epoch": 0.9609464444727134, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 1.9887008666992188, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.846877932548523, + "num_tokens": 288102833.0, + "step": 7554 + }, + { + "epoch": 0.9610736547513039, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.997658371925354, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8751330375671387, + "num_tokens": 288135927.0, + "step": 7555 + }, + { + "epoch": 0.9612008650298944, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.733115553855896, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8812341690063477, + "num_tokens": 288173842.0, + "step": 7556 + }, + { + "epoch": 0.961328075308485, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7028369903564453, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8661761283874512, + "num_tokens": 288215007.0, + "step": 7557 + }, + { + "epoch": 0.9614552855870754, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.1967310905456543, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8509384393692017, + "num_tokens": 288251598.0, + "step": 7558 + }, + { + "epoch": 0.9615824958656659, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9596664905548096, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8625326752662659, + "num_tokens": 288292811.0, + "step": 7559 + }, + { + "epoch": 0.9617097061442564, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.949903130531311, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8660528659820557, + "num_tokens": 288337790.0, + "step": 7560 + }, + { + "epoch": 0.961836916422847, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8076446056365967, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8581603169441223, + "num_tokens": 288373347.0, + "step": 7561 + }, + { + "epoch": 0.9619641267014375, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.862322449684143, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8826988935470581, + "num_tokens": 288405650.0, + "step": 7562 + }, + { + "epoch": 0.962091336980028, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8710834980010986, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8687689900398254, + "num_tokens": 288443849.0, + "step": 7563 + }, + { + "epoch": 0.9622185472586186, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.797102689743042, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8419858813285828, + "num_tokens": 288487557.0, + "step": 7564 + }, + { + "epoch": 0.962345757537209, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8564850091934204, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.851929247379303, + "num_tokens": 288529932.0, + "step": 7565 + }, + { + "epoch": 0.9624729678157995, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9977935552597046, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8572136163711548, + "num_tokens": 288568276.0, + "step": 7566 + }, + { + "epoch": 0.96260017809439, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8679457902908325, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8640455603599548, + "num_tokens": 288602387.0, + "step": 7567 + }, + { + "epoch": 0.9627273883729806, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.109771728515625, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8692595958709717, + "num_tokens": 288638157.0, + "step": 7568 + }, + { + "epoch": 0.9628545986515711, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9177738428115845, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.858623743057251, + "num_tokens": 288675054.0, + "step": 7569 + }, + { + "epoch": 0.9629818089301616, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9752304553985596, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8676214814186096, + "num_tokens": 288714222.0, + "step": 7570 + }, + { + "epoch": 0.963109019208752, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9641058444976807, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8517212271690369, + "num_tokens": 288752772.0, + "step": 7571 + }, + { + "epoch": 0.9632362294873426, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.789508581161499, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8465197086334229, + "num_tokens": 288792067.0, + "step": 7572 + }, + { + "epoch": 0.9633634397659331, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8749287128448486, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8732945919036865, + "num_tokens": 288826172.0, + "step": 7573 + }, + { + "epoch": 0.9634906500445236, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.2799220085144043, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8726547956466675, + "num_tokens": 288863774.0, + "step": 7574 + }, + { + "epoch": 0.9636178603231141, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.9107452630996704, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.855566143989563, + "num_tokens": 288901892.0, + "step": 7575 + }, + { + "epoch": 0.9637450706017047, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0358080863952637, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8512894511222839, + "num_tokens": 288941470.0, + "step": 7576 + }, + { + "epoch": 0.9638722808802951, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7641735076904297, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8485603332519531, + "num_tokens": 288988616.0, + "step": 7577 + }, + { + "epoch": 0.9639994911588856, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.799103021621704, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8780046701431274, + "num_tokens": 289024589.0, + "step": 7578 + }, + { + "epoch": 0.9641267014374761, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7991149425506592, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8643630743026733, + "num_tokens": 289064334.0, + "step": 7579 + }, + { + "epoch": 0.9642539117160667, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9357367753982544, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8509470820426941, + "num_tokens": 289099385.0, + "step": 7580 + }, + { + "epoch": 0.9643811219946572, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8558290004730225, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8663244247436523, + "num_tokens": 289139143.0, + "step": 7581 + }, + { + "epoch": 0.9645083322732477, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8648650646209717, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8550653457641602, + "num_tokens": 289179303.0, + "step": 7582 + }, + { + "epoch": 0.9646355425518381, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.007375717163086, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8536900877952576, + "num_tokens": 289214218.0, + "step": 7583 + }, + { + "epoch": 0.9647627528304287, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.18656849861145, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8594274520874023, + "num_tokens": 289240991.0, + "step": 7584 + }, + { + "epoch": 0.9648899631090192, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0674359798431396, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8558677434921265, + "num_tokens": 289273374.0, + "step": 7585 + }, + { + "epoch": 0.9650171733876097, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8339707851409912, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8484028577804565, + "num_tokens": 289314904.0, + "step": 7586 + }, + { + "epoch": 0.9651443836662003, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8713356256484985, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8549270629882812, + "num_tokens": 289358110.0, + "step": 7587 + }, + { + "epoch": 0.9652715939447908, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8813570737838745, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8529163599014282, + "num_tokens": 289391581.0, + "step": 7588 + }, + { + "epoch": 0.9653988042233812, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8983142375946045, + "learning_rate": 1e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8359315395355225, + "num_tokens": 289432280.0, + "step": 7589 + }, + { + "epoch": 0.9655260145019717, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8708810806274414, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.845908522605896, + "num_tokens": 289473222.0, + "step": 7590 + }, + { + "epoch": 0.9656532247805623, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.1052916049957275, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8567733764648438, + "num_tokens": 289512898.0, + "step": 7591 + }, + { + "epoch": 0.9657804350591528, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 5.112425804138184, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8653104901313782, + "num_tokens": 289554394.0, + "step": 7592 + }, + { + "epoch": 0.9659076453377433, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.0093371868133545, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8574339151382446, + "num_tokens": 289594871.0, + "step": 7593 + }, + { + "epoch": 0.9660348556163338, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 2.271467924118042, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8499786257743835, + "num_tokens": 289629165.0, + "step": 7594 + }, + { + "epoch": 0.9661620658949243, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.7942490577697754, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8605265617370605, + "num_tokens": 289670131.0, + "step": 7595 + }, + { + "epoch": 0.9662892761735148, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.1450281143188477, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8649419546127319, + "num_tokens": 289705238.0, + "step": 7596 + }, + { + "epoch": 0.9664164864521053, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9204860925674438, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8640350103378296, + "num_tokens": 289740146.0, + "step": 7597 + }, + { + "epoch": 0.9665436967306958, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.797973394393921, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8653280735015869, + "num_tokens": 289781145.0, + "step": 7598 + }, + { + "epoch": 0.9666709070092864, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.083981990814209, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.844610333442688, + "num_tokens": 289816347.0, + "step": 7599 + }, + { + "epoch": 0.9667981172878769, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8885506391525269, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8471195697784424, + "num_tokens": 289852974.0, + "step": 7600 + }, + { + "epoch": 0.9669253275664673, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9552768468856812, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8492740988731384, + "num_tokens": 289894553.0, + "step": 7601 + }, + { + "epoch": 0.9670525378450578, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0592501163482666, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8427194356918335, + "num_tokens": 289930641.0, + "step": 7602 + }, + { + "epoch": 0.9671797481236484, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8657976388931274, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8555347919464111, + "num_tokens": 289972690.0, + "step": 7603 + }, + { + "epoch": 0.9673069584022389, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8174431324005127, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8541062474250793, + "num_tokens": 290006518.0, + "step": 7604 + }, + { + "epoch": 0.9674341686808294, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8070948123931885, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8483462929725647, + "num_tokens": 290045206.0, + "step": 7605 + }, + { + "epoch": 0.96756137895942, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8506484031677246, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8661366701126099, + "num_tokens": 290086976.0, + "step": 7606 + }, + { + "epoch": 0.9676885892380104, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7795156240463257, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8731198906898499, + "num_tokens": 290126894.0, + "step": 7607 + }, + { + "epoch": 0.9678157995166009, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.7928205728530884, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8749586343765259, + "num_tokens": 290161684.0, + "step": 7608 + }, + { + "epoch": 0.9679430097951914, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0673201084136963, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8630107045173645, + "num_tokens": 290203407.0, + "step": 7609 + }, + { + "epoch": 0.968070220073782, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.945599913597107, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8671337366104126, + "num_tokens": 290238645.0, + "step": 7610 + }, + { + "epoch": 0.9681974303523725, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.7631666660308838, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8603049516677856, + "num_tokens": 290280279.0, + "step": 7611 + }, + { + "epoch": 0.968324640630963, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8056037425994873, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8598710298538208, + "num_tokens": 290317739.0, + "step": 7612 + }, + { + "epoch": 0.9684518509095535, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8214586973190308, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8600606322288513, + "num_tokens": 290360384.0, + "step": 7613 + }, + { + "epoch": 0.968579061188144, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8311524391174316, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.868309736251831, + "num_tokens": 290398706.0, + "step": 7614 + }, + { + "epoch": 0.9687062714667345, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8291218280792236, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.873683512210846, + "num_tokens": 290439524.0, + "step": 7615 + }, + { + "epoch": 0.968833481745325, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0636587142944336, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8561863303184509, + "num_tokens": 290472125.0, + "step": 7616 + }, + { + "epoch": 0.9689606920239155, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.973657250404358, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8450273871421814, + "num_tokens": 290514348.0, + "step": 7617 + }, + { + "epoch": 0.9690879023025061, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8893839120864868, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8552839756011963, + "num_tokens": 290551699.0, + "step": 7618 + }, + { + "epoch": 0.9692151125810966, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.05745792388916, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8621419668197632, + "num_tokens": 290588644.0, + "step": 7619 + }, + { + "epoch": 0.969342322859687, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8106969594955444, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8621716499328613, + "num_tokens": 290629546.0, + "step": 7620 + }, + { + "epoch": 0.9694695331382776, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.821244716644287, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.868672251701355, + "num_tokens": 290665527.0, + "step": 7621 + }, + { + "epoch": 0.9695967434168681, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.768794059753418, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8519902229309082, + "num_tokens": 290706667.0, + "step": 7622 + }, + { + "epoch": 0.9697239536954586, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.941914439201355, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8560826182365417, + "num_tokens": 290745698.0, + "step": 7623 + }, + { + "epoch": 0.9698511639740491, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.064351797103882, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8659186363220215, + "num_tokens": 290777983.0, + "step": 7624 + }, + { + "epoch": 0.9699783742526397, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.902092456817627, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8550575971603394, + "num_tokens": 290814184.0, + "step": 7625 + }, + { + "epoch": 0.9701055845312301, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7960617542266846, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.852182149887085, + "num_tokens": 290854054.0, + "step": 7626 + }, + { + "epoch": 0.9702327948098206, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9043933153152466, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8595787882804871, + "num_tokens": 290891325.0, + "step": 7627 + }, + { + "epoch": 0.9703600050884111, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8098489046096802, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8592811822891235, + "num_tokens": 290930101.0, + "step": 7628 + }, + { + "epoch": 0.9704872153670017, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9745100736618042, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8614124059677124, + "num_tokens": 290964691.0, + "step": 7629 + }, + { + "epoch": 0.9706144256455922, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.8281623125076294, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8579736948013306, + "num_tokens": 291008140.0, + "step": 7630 + }, + { + "epoch": 0.9707416359241827, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8581454753875732, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.855067253112793, + "num_tokens": 291041484.0, + "step": 7631 + }, + { + "epoch": 0.9708688462027731, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.5682506561279297, + "learning_rate": 1e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8417739868164062, + "num_tokens": 291082337.0, + "step": 7632 + }, + { + "epoch": 0.9709960564813637, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.0805892944335938, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8594015836715698, + "num_tokens": 291115921.0, + "step": 7633 + }, + { + "epoch": 0.9711232667599542, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8435442447662354, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8686856031417847, + "num_tokens": 291154367.0, + "step": 7634 + }, + { + "epoch": 0.9712504770385447, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8157649040222168, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.860373318195343, + "num_tokens": 291198919.0, + "step": 7635 + }, + { + "epoch": 0.9713776873171353, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9683620929718018, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8490958213806152, + "num_tokens": 291237062.0, + "step": 7636 + }, + { + "epoch": 0.9715048975957258, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.7492680549621582, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8729797005653381, + "num_tokens": 291275676.0, + "step": 7637 + }, + { + "epoch": 0.9716321078743162, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7801945209503174, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8529097437858582, + "num_tokens": 291316785.0, + "step": 7638 + }, + { + "epoch": 0.9717593181529067, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9924012422561646, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8578521013259888, + "num_tokens": 291360448.0, + "step": 7639 + }, + { + "epoch": 0.9718865284314973, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8032268285751343, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8577942848205566, + "num_tokens": 291401650.0, + "step": 7640 + }, + { + "epoch": 0.9720137387100878, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.317134380340576, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8610570430755615, + "num_tokens": 291439736.0, + "step": 7641 + }, + { + "epoch": 0.9721409489886783, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.853682518005371, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8661887049674988, + "num_tokens": 291479077.0, + "step": 7642 + }, + { + "epoch": 0.9722681592672688, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9654312133789062, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8707801103591919, + "num_tokens": 291512470.0, + "step": 7643 + }, + { + "epoch": 0.9723953695458593, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8382011651992798, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8717107772827148, + "num_tokens": 291552797.0, + "step": 7644 + }, + { + "epoch": 0.9725225798244498, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.038029670715332, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8641948699951172, + "num_tokens": 291589885.0, + "step": 7645 + }, + { + "epoch": 0.9726497901030403, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.7990127801895142, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8599624633789062, + "num_tokens": 291633005.0, + "step": 7646 + }, + { + "epoch": 0.9727770003816308, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.012571096420288, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8574634194374084, + "num_tokens": 291666083.0, + "step": 7647 + }, + { + "epoch": 0.9729042106602214, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.851267695426941, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8662711381912231, + "num_tokens": 291706421.0, + "step": 7648 + }, + { + "epoch": 0.9730314209388119, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8187334537506104, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.866003155708313, + "num_tokens": 291744641.0, + "step": 7649 + }, + { + "epoch": 0.9731586312174023, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.883036494255066, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8856797218322754, + "num_tokens": 291780586.0, + "step": 7650 + }, + { + "epoch": 0.9732858414959928, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7939602136611938, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.856383204460144, + "num_tokens": 291820482.0, + "step": 7651 + }, + { + "epoch": 0.9734130517745834, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8476959466934204, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8706212043762207, + "num_tokens": 291858568.0, + "step": 7652 + }, + { + "epoch": 0.9735402620531739, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.15580677986145, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8518947958946228, + "num_tokens": 291889383.0, + "step": 7653 + }, + { + "epoch": 0.9736674723317644, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8091493844985962, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.869225263595581, + "num_tokens": 291927541.0, + "step": 7654 + }, + { + "epoch": 0.973794682610355, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.0351130962371826, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8457884788513184, + "num_tokens": 291969074.0, + "step": 7655 + }, + { + "epoch": 0.9739218928889454, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7794008255004883, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8586466908454895, + "num_tokens": 292008311.0, + "step": 7656 + }, + { + "epoch": 0.9740491031675359, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9522490501403809, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8521310687065125, + "num_tokens": 292051810.0, + "step": 7657 + }, + { + "epoch": 0.9741763134461264, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9289344549179077, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8686336278915405, + "num_tokens": 292090411.0, + "step": 7658 + }, + { + "epoch": 0.974303523724717, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9766615629196167, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8524118661880493, + "num_tokens": 292124411.0, + "step": 7659 + }, + { + "epoch": 0.9744307340033075, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8270349502563477, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8611218333244324, + "num_tokens": 292166428.0, + "step": 7660 + }, + { + "epoch": 0.974557944281898, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.894801378250122, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8631075024604797, + "num_tokens": 292202687.0, + "step": 7661 + }, + { + "epoch": 0.9746851545604885, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.888001561164856, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8703234195709229, + "num_tokens": 292246364.0, + "step": 7662 + }, + { + "epoch": 0.974812364839079, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8684161901474, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8615788221359253, + "num_tokens": 292285789.0, + "step": 7663 + }, + { + "epoch": 0.9749395751176695, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8876235485076904, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8548733592033386, + "num_tokens": 292320755.0, + "step": 7664 + }, + { + "epoch": 0.97506678539626, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.7713826894760132, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8697155714035034, + "num_tokens": 292362818.0, + "step": 7665 + }, + { + "epoch": 0.9751939956748505, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.7941651344299316, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8531983494758606, + "num_tokens": 292404016.0, + "step": 7666 + }, + { + "epoch": 0.9753212059534411, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.074280261993408, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8478798270225525, + "num_tokens": 292442117.0, + "step": 7667 + }, + { + "epoch": 0.9754484162320316, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9348536729812622, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8678654432296753, + "num_tokens": 292472523.0, + "step": 7668 + }, + { + "epoch": 0.975575626510622, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0030338764190674, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8661689758300781, + "num_tokens": 292508571.0, + "step": 7669 + }, + { + "epoch": 0.9757028367892125, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.96034836769104, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8572189807891846, + "num_tokens": 292545358.0, + "step": 7670 + }, + { + "epoch": 0.9758300470678031, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8486908674240112, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8461065292358398, + "num_tokens": 292583962.0, + "step": 7671 + }, + { + "epoch": 0.9759572573463936, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.917001485824585, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8601704835891724, + "num_tokens": 292616186.0, + "step": 7672 + }, + { + "epoch": 0.9760844676249841, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9009971618652344, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8563898205757141, + "num_tokens": 292657378.0, + "step": 7673 + }, + { + "epoch": 0.9762116779035747, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8057570457458496, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.874906063079834, + "num_tokens": 292692041.0, + "step": 7674 + }, + { + "epoch": 0.9763388881821651, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.015608549118042, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8581683039665222, + "num_tokens": 292725062.0, + "step": 7675 + }, + { + "epoch": 0.9764660984607556, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.702024221420288, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8685933351516724, + "num_tokens": 292766522.0, + "step": 7676 + }, + { + "epoch": 0.9765933087393461, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.83277428150177, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8596393465995789, + "num_tokens": 292811371.0, + "step": 7677 + }, + { + "epoch": 0.9767205190179367, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8527151346206665, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8608803153038025, + "num_tokens": 292850237.0, + "step": 7678 + }, + { + "epoch": 0.9768477292965272, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9577162265777588, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8655134439468384, + "num_tokens": 292887291.0, + "step": 7679 + }, + { + "epoch": 0.9769749395751177, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.870847225189209, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8741002082824707, + "num_tokens": 292928396.0, + "step": 7680 + }, + { + "epoch": 0.9771021498537081, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0236823558807373, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8643519878387451, + "num_tokens": 292962748.0, + "step": 7681 + }, + { + "epoch": 0.9772293601322987, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.8496242761611938, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8724149465560913, + "num_tokens": 293002330.0, + "step": 7682 + }, + { + "epoch": 0.9773565704108892, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.868204116821289, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8615763187408447, + "num_tokens": 293038288.0, + "step": 7683 + }, + { + "epoch": 0.9774837806894797, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.7500566244125366, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8640108108520508, + "num_tokens": 293078121.0, + "step": 7684 + }, + { + "epoch": 0.9776109909680702, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.067143678665161, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8580278158187866, + "num_tokens": 293116846.0, + "step": 7685 + }, + { + "epoch": 0.9777382012466608, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.8655619621276855, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8569403886795044, + "num_tokens": 293162237.0, + "step": 7686 + }, + { + "epoch": 0.9778654115252512, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.9748913049697876, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8574367761611938, + "num_tokens": 293196631.0, + "step": 7687 + }, + { + "epoch": 0.9779926218038417, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8528392314910889, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8701151013374329, + "num_tokens": 293238883.0, + "step": 7688 + }, + { + "epoch": 0.9781198320824323, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.294606924057007, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8513485789299011, + "num_tokens": 293270802.0, + "step": 7689 + }, + { + "epoch": 0.9782470423610228, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.961626410484314, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8478447198867798, + "num_tokens": 293314910.0, + "step": 7690 + }, + { + "epoch": 0.9783742526396133, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8203473091125488, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8741000890731812, + "num_tokens": 293353083.0, + "step": 7691 + }, + { + "epoch": 0.9785014629182038, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9533392190933228, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8625824451446533, + "num_tokens": 293389473.0, + "step": 7692 + }, + { + "epoch": 0.9786286731967943, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.6981403827667236, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8747645616531372, + "num_tokens": 293426893.0, + "step": 7693 + }, + { + "epoch": 0.9787558834753848, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.792639136314392, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8668573498725891, + "num_tokens": 293468116.0, + "step": 7694 + }, + { + "epoch": 0.9788830937539753, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0057101249694824, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8649875521659851, + "num_tokens": 293503050.0, + "step": 7695 + }, + { + "epoch": 0.9790103040325658, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8401838541030884, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.85108482837677, + "num_tokens": 293541673.0, + "step": 7696 + }, + { + "epoch": 0.9791375143111564, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.4428679943084717, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8576993942260742, + "num_tokens": 293578472.0, + "step": 7697 + }, + { + "epoch": 0.9792647245897469, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.780355215072632, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8637963533401489, + "num_tokens": 293611204.0, + "step": 7698 + }, + { + "epoch": 0.9793919348683373, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9235317707061768, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8512721657752991, + "num_tokens": 293651352.0, + "step": 7699 + }, + { + "epoch": 0.9795191451469278, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.7999677658081055, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8727895021438599, + "num_tokens": 293687396.0, + "step": 7700 + }, + { + "epoch": 0.9796463554255184, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.9371461868286133, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8570427894592285, + "num_tokens": 293725083.0, + "step": 7701 + }, + { + "epoch": 0.9797735657041089, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8081210851669312, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8612925410270691, + "num_tokens": 293766136.0, + "step": 7702 + }, + { + "epoch": 0.9799007759826994, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9334441423416138, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8439613580703735, + "num_tokens": 293805810.0, + "step": 7703 + }, + { + "epoch": 0.98002798626129, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9340975284576416, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.853621780872345, + "num_tokens": 293839041.0, + "step": 7704 + }, + { + "epoch": 0.9801551965398804, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8712375164031982, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8756871223449707, + "num_tokens": 293876541.0, + "step": 7705 + }, + { + "epoch": 0.9802824068184709, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8278058767318726, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8761171698570251, + "num_tokens": 293918141.0, + "step": 7706 + }, + { + "epoch": 0.9804096170970614, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.659259557723999, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8483511209487915, + "num_tokens": 293958078.0, + "step": 7707 + }, + { + "epoch": 0.980536827375652, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.0855517387390137, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8716253042221069, + "num_tokens": 293990347.0, + "step": 7708 + }, + { + "epoch": 0.9806640376542425, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.1086294651031494, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8664158582687378, + "num_tokens": 294032136.0, + "step": 7709 + }, + { + "epoch": 0.980791247932833, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.136138677597046, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8480348587036133, + "num_tokens": 294065136.0, + "step": 7710 + }, + { + "epoch": 0.9809184582114235, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.1436550617218018, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8749518394470215, + "num_tokens": 294104539.0, + "step": 7711 + }, + { + "epoch": 0.981045668490014, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.7179193496704102, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8593523502349854, + "num_tokens": 294148962.0, + "step": 7712 + }, + { + "epoch": 0.9811728787686045, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8696237802505493, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8657280802726746, + "num_tokens": 294181866.0, + "step": 7713 + }, + { + "epoch": 0.981300089047195, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.743935465812683, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8703305721282959, + "num_tokens": 294223558.0, + "step": 7714 + }, + { + "epoch": 0.9814272993257855, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0083582401275635, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8658409118652344, + "num_tokens": 294260487.0, + "step": 7715 + }, + { + "epoch": 0.9815545096043761, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9439772367477417, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8623024821281433, + "num_tokens": 294302642.0, + "step": 7716 + }, + { + "epoch": 0.9816817198829666, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0338573455810547, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.851569652557373, + "num_tokens": 294338376.0, + "step": 7717 + }, + { + "epoch": 0.981808930161557, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0684032440185547, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8559603691101074, + "num_tokens": 294372089.0, + "step": 7718 + }, + { + "epoch": 0.9819361404401475, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.032050371170044, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.864851713180542, + "num_tokens": 294403801.0, + "step": 7719 + }, + { + "epoch": 0.9820633507187381, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8757765293121338, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8646627068519592, + "num_tokens": 294440301.0, + "step": 7720 + }, + { + "epoch": 0.9821905609973286, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8870570659637451, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8605128526687622, + "num_tokens": 294472967.0, + "step": 7721 + }, + { + "epoch": 0.9823177712759191, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9596385955810547, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.851615309715271, + "num_tokens": 294518248.0, + "step": 7722 + }, + { + "epoch": 0.9824449815545097, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.997774362564087, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8603092432022095, + "num_tokens": 294553899.0, + "step": 7723 + }, + { + "epoch": 0.9825721918331001, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9121522903442383, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8390809893608093, + "num_tokens": 294591382.0, + "step": 7724 + }, + { + "epoch": 0.9826994021116906, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8760545253753662, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8625271320343018, + "num_tokens": 294629606.0, + "step": 7725 + }, + { + "epoch": 0.9828266123902811, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8380827903747559, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8642146587371826, + "num_tokens": 294669417.0, + "step": 7726 + }, + { + "epoch": 0.9829538226688717, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7361177206039429, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8719696998596191, + "num_tokens": 294709876.0, + "step": 7727 + }, + { + "epoch": 0.9830810329474622, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8690004348754883, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8529539108276367, + "num_tokens": 294750565.0, + "step": 7728 + }, + { + "epoch": 0.9832082432260527, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0720107555389404, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8619776964187622, + "num_tokens": 294790586.0, + "step": 7729 + }, + { + "epoch": 0.9833354535046431, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8823195695877075, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8599481582641602, + "num_tokens": 294830176.0, + "step": 7730 + }, + { + "epoch": 0.9834626637832337, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8642399311065674, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8493330478668213, + "num_tokens": 294874199.0, + "step": 7731 + }, + { + "epoch": 0.9835898740618242, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.860192060470581, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8689022064208984, + "num_tokens": 294914219.0, + "step": 7732 + }, + { + "epoch": 0.9837170843404147, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9763370752334595, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.839128851890564, + "num_tokens": 294950726.0, + "step": 7733 + }, + { + "epoch": 0.9838442946190052, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9036853313446045, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8612545728683472, + "num_tokens": 294993815.0, + "step": 7734 + }, + { + "epoch": 0.9839715048975958, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8302314281463623, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8708875179290771, + "num_tokens": 295029275.0, + "step": 7735 + }, + { + "epoch": 0.9840987151761862, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8546916246414185, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8616988062858582, + "num_tokens": 295066166.0, + "step": 7736 + }, + { + "epoch": 0.9842259254547767, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9462532997131348, + "learning_rate": 1e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8394870162010193, + "num_tokens": 295100436.0, + "step": 7737 + }, + { + "epoch": 0.9843531357333672, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7054356336593628, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8620749711990356, + "num_tokens": 295141756.0, + "step": 7738 + }, + { + "epoch": 0.9844803460119578, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.78025484085083, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8789706230163574, + "num_tokens": 295178894.0, + "step": 7739 + }, + { + "epoch": 0.9846075562905483, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.089658498764038, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8507670164108276, + "num_tokens": 295219133.0, + "step": 7740 + }, + { + "epoch": 0.9847347665691388, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.9313936233520508, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8614369630813599, + "num_tokens": 295260322.0, + "step": 7741 + }, + { + "epoch": 0.9848619768477292, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8086272478103638, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8545289635658264, + "num_tokens": 295299675.0, + "step": 7742 + }, + { + "epoch": 0.9849891871263198, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8061569929122925, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8665262460708618, + "num_tokens": 295336281.0, + "step": 7743 + }, + { + "epoch": 0.9851163974049103, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8697478771209717, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8597333431243896, + "num_tokens": 295376932.0, + "step": 7744 + }, + { + "epoch": 0.9852436076835008, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8423198461532593, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8580132722854614, + "num_tokens": 295421904.0, + "step": 7745 + }, + { + "epoch": 0.9853708179620914, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8976871967315674, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8623106479644775, + "num_tokens": 295456688.0, + "step": 7746 + }, + { + "epoch": 0.9854980282406819, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.7407362461090088, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8785154819488525, + "num_tokens": 295500022.0, + "step": 7747 + }, + { + "epoch": 0.9856252385192723, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.763546109199524, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8812054395675659, + "num_tokens": 295538106.0, + "step": 7748 + }, + { + "epoch": 0.9857524487978628, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8921526670455933, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.857244610786438, + "num_tokens": 295572465.0, + "step": 7749 + }, + { + "epoch": 0.9858796590764534, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.883049488067627, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8666199445724487, + "num_tokens": 295614577.0, + "step": 7750 + }, + { + "epoch": 0.9860068693550439, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.803828477859497, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8650515079498291, + "num_tokens": 295654780.0, + "step": 7751 + }, + { + "epoch": 0.9861340796336344, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.752740502357483, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.860648512840271, + "num_tokens": 295695880.0, + "step": 7752 + }, + { + "epoch": 0.986261289912225, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.839872121810913, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8618522882461548, + "num_tokens": 295738176.0, + "step": 7753 + }, + { + "epoch": 0.9863885001908154, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.0044360160827637, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8504757285118103, + "num_tokens": 295777562.0, + "step": 7754 + }, + { + "epoch": 0.9865157104694059, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.7329026460647583, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8772673010826111, + "num_tokens": 295815013.0, + "step": 7755 + }, + { + "epoch": 0.9866429207479964, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.805206537246704, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8500048518180847, + "num_tokens": 295857751.0, + "step": 7756 + }, + { + "epoch": 0.986770131026587, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.7781155109405518, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8679358959197998, + "num_tokens": 295896033.0, + "step": 7757 + }, + { + "epoch": 0.9868973413051775, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8158977031707764, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8641376495361328, + "num_tokens": 295936838.0, + "step": 7758 + }, + { + "epoch": 0.987024551583768, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.9302809238433838, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8667564988136292, + "num_tokens": 295972307.0, + "step": 7759 + }, + { + "epoch": 0.9871517618623584, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9106541872024536, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8594808578491211, + "num_tokens": 296009966.0, + "step": 7760 + }, + { + "epoch": 0.987278972140949, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.3994359970092773, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.85872882604599, + "num_tokens": 296047874.0, + "step": 7761 + }, + { + "epoch": 0.9874061824195395, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.764054536819458, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8686330318450928, + "num_tokens": 296090591.0, + "step": 7762 + }, + { + "epoch": 0.98753339269813, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.1753480434417725, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8573777079582214, + "num_tokens": 296133024.0, + "step": 7763 + }, + { + "epoch": 0.9876606029767205, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8866990804672241, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8494260907173157, + "num_tokens": 296177347.0, + "step": 7764 + }, + { + "epoch": 0.9877878132553111, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.7892874479293823, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8521619439125061, + "num_tokens": 296215532.0, + "step": 7765 + }, + { + "epoch": 0.9879150235339016, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.973597526550293, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8505675792694092, + "num_tokens": 296249494.0, + "step": 7766 + }, + { + "epoch": 0.988042233812492, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9226107597351074, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8525747656822205, + "num_tokens": 296289370.0, + "step": 7767 + }, + { + "epoch": 0.9881694440910825, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.255009412765503, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8624200820922852, + "num_tokens": 296329376.0, + "step": 7768 + }, + { + "epoch": 0.9882966543696731, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.928321123123169, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8675245642662048, + "num_tokens": 296368875.0, + "step": 7769 + }, + { + "epoch": 0.9884238646482636, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.877277135848999, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.859682559967041, + "num_tokens": 296404766.0, + "step": 7770 + }, + { + "epoch": 0.9885510749268541, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8699461221694946, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8596104383468628, + "num_tokens": 296441705.0, + "step": 7771 + }, + { + "epoch": 0.9886782852054447, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8761193752288818, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8680589199066162, + "num_tokens": 296481075.0, + "step": 7772 + }, + { + "epoch": 0.9888054954840351, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9923508167266846, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.857832133769989, + "num_tokens": 296511404.0, + "step": 7773 + }, + { + "epoch": 0.9889327057626256, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.0700526237487793, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8518053293228149, + "num_tokens": 296546080.0, + "step": 7774 + }, + { + "epoch": 0.9890599160412161, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8449584245681763, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8630260229110718, + "num_tokens": 296586119.0, + "step": 7775 + }, + { + "epoch": 0.9891871263198067, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.358687400817871, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.859699547290802, + "num_tokens": 296629602.0, + "step": 7776 + }, + { + "epoch": 0.9893143365983972, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.8778923749923706, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.854114830493927, + "num_tokens": 296666709.0, + "step": 7777 + }, + { + "epoch": 0.9894415468769877, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9285430908203125, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8604632019996643, + "num_tokens": 296702611.0, + "step": 7778 + }, + { + "epoch": 0.9895687571555781, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9409247636795044, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.866762638092041, + "num_tokens": 296737649.0, + "step": 7779 + }, + { + "epoch": 0.9896959674341687, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.7872778177261353, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8622824549674988, + "num_tokens": 296776915.0, + "step": 7780 + }, + { + "epoch": 0.9898231777127592, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.883831262588501, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8682481646537781, + "num_tokens": 296811491.0, + "step": 7781 + }, + { + "epoch": 0.9899503879913497, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.903584361076355, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8656386137008667, + "num_tokens": 296847602.0, + "step": 7782 + }, + { + "epoch": 0.9900775982699402, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.0125672817230225, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8442187309265137, + "num_tokens": 296884737.0, + "step": 7783 + }, + { + "epoch": 0.9902048085485308, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9140161275863647, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8495422601699829, + "num_tokens": 296927780.0, + "step": 7784 + }, + { + "epoch": 0.9903320188271212, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.0351107120513916, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8763101100921631, + "num_tokens": 296964200.0, + "step": 7785 + }, + { + "epoch": 0.9904592291057117, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.789980173110962, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8527801036834717, + "num_tokens": 297009927.0, + "step": 7786 + }, + { + "epoch": 0.9905864393843022, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.060239553451538, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.847962498664856, + "num_tokens": 297050788.0, + "step": 7787 + }, + { + "epoch": 0.9907136496628928, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 3.0167782306671143, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.866523027420044, + "num_tokens": 297092398.0, + "step": 7788 + }, + { + "epoch": 0.9908408599414833, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.263395309448242, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8588548898696899, + "num_tokens": 297130728.0, + "step": 7789 + }, + { + "epoch": 0.9909680702200738, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.7935059070587158, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8841153383255005, + "num_tokens": 297169124.0, + "step": 7790 + }, + { + "epoch": 0.9910952804986642, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.888493299484253, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8714208602905273, + "num_tokens": 297204914.0, + "step": 7791 + }, + { + "epoch": 0.9912224907772548, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.0998053550720215, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8579224348068237, + "num_tokens": 297240551.0, + "step": 7792 + }, + { + "epoch": 0.9913497010558453, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.6911460161209106, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8524956703186035, + "num_tokens": 297287160.0, + "step": 7793 + }, + { + "epoch": 0.9914769113344358, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8225802183151245, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8632733225822449, + "num_tokens": 297323394.0, + "step": 7794 + }, + { + "epoch": 0.9916041216130264, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.0940144062042236, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8394811749458313, + "num_tokens": 297357808.0, + "step": 7795 + }, + { + "epoch": 0.9917313318916169, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.7939724922180176, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8578897714614868, + "num_tokens": 297397300.0, + "step": 7796 + }, + { + "epoch": 0.9918585421702073, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.795745611190796, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8695409297943115, + "num_tokens": 297434699.0, + "step": 7797 + }, + { + "epoch": 0.9919857524487978, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8997528553009033, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8610695600509644, + "num_tokens": 297473939.0, + "step": 7798 + }, + { + "epoch": 0.9921129627273884, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.093364953994751, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8535885810852051, + "num_tokens": 297511154.0, + "step": 7799 + }, + { + "epoch": 0.9922401730059789, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.7167763710021973, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8516921401023865, + "num_tokens": 297556092.0, + "step": 7800 + }, + { + "epoch": 0.9923673832845694, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.7011057138442993, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8651664853096008, + "num_tokens": 297598160.0, + "step": 7801 + }, + { + "epoch": 0.9924945935631599, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9089736938476562, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8782222270965576, + "num_tokens": 297638706.0, + "step": 7802 + }, + { + "epoch": 0.9926218038417504, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.7748363018035889, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8659034967422485, + "num_tokens": 297675955.0, + "step": 7803 + }, + { + "epoch": 0.9927490141203409, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.8405033349990845, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8571820855140686, + "num_tokens": 297715362.0, + "step": 7804 + }, + { + "epoch": 0.9928762243989314, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9102251529693604, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8476317524909973, + "num_tokens": 297750931.0, + "step": 7805 + }, + { + "epoch": 0.993003434677522, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.9053423404693604, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8527694940567017, + "num_tokens": 297786107.0, + "step": 7806 + }, + { + "epoch": 0.9931306449561125, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.8979524374008179, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.860932469367981, + "num_tokens": 297824782.0, + "step": 7807 + }, + { + "epoch": 0.993257855234703, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.9296432733535767, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8573903441429138, + "num_tokens": 297861748.0, + "step": 7808 + }, + { + "epoch": 0.9933850655132934, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.8614308834075928, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8612527847290039, + "num_tokens": 297902722.0, + "step": 7809 + }, + { + "epoch": 0.993512275791884, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.7170430421829224, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.863059401512146, + "num_tokens": 297948709.0, + "step": 7810 + }, + { + "epoch": 0.9936394860704745, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.889835000038147, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8640569448471069, + "num_tokens": 297986446.0, + "step": 7811 + }, + { + "epoch": 0.993766696349065, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.8719121217727661, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8548226356506348, + "num_tokens": 298024004.0, + "step": 7812 + }, + { + "epoch": 0.9938939066276555, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.9157757759094238, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8512375354766846, + "num_tokens": 298065518.0, + "step": 7813 + }, + { + "epoch": 0.9940211169062461, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.8293559551239014, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8543742895126343, + "num_tokens": 298108138.0, + "step": 7814 + }, + { + "epoch": 0.9941483271848366, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.830061912536621, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8607710599899292, + "num_tokens": 298149348.0, + "step": 7815 + }, + { + "epoch": 0.994275537463427, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.8659743070602417, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8582727909088135, + "num_tokens": 298186347.0, + "step": 7816 + }, + { + "epoch": 0.9944027477420175, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.969589114189148, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8398646116256714, + "num_tokens": 298223322.0, + "step": 7817 + }, + { + "epoch": 0.9945299580206081, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.9090913534164429, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8555819988250732, + "num_tokens": 298258794.0, + "step": 7818 + }, + { + "epoch": 0.9946571682991986, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.9367990493774414, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8542791604995728, + "num_tokens": 298297510.0, + "step": 7819 + }, + { + "epoch": 0.9947843785777891, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.990802526473999, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8659629821777344, + "num_tokens": 298333784.0, + "step": 7820 + }, + { + "epoch": 0.9949115888563796, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.0420944690704346, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8666002154350281, + "num_tokens": 298371241.0, + "step": 7821 + }, + { + "epoch": 0.9950387991349701, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.23595929145813, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8467153310775757, + "num_tokens": 298407902.0, + "step": 7822 + }, + { + "epoch": 0.9951660094135606, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.7832887172698975, + "learning_rate": 1e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.8356016278266907, + "num_tokens": 298452339.0, + "step": 7823 + }, + { + "epoch": 0.9952932196921511, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.7291638851165771, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8515858054161072, + "num_tokens": 298495854.0, + "step": 7824 + }, + { + "epoch": 0.9954204299707416, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.87252938747406, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8556068539619446, + "num_tokens": 298529642.0, + "step": 7825 + }, + { + "epoch": 0.9955476402493322, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.7623287439346313, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8623277544975281, + "num_tokens": 298574396.0, + "step": 7826 + }, + { + "epoch": 0.9956748505279227, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.833997130393982, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8478938937187195, + "num_tokens": 298612457.0, + "step": 7827 + }, + { + "epoch": 0.9958020608065131, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.8326022624969482, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8607975244522095, + "num_tokens": 298645061.0, + "step": 7828 + }, + { + "epoch": 0.9959292710851037, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.8529698848724365, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8584308624267578, + "num_tokens": 298688014.0, + "step": 7829 + }, + { + "epoch": 0.9960564813636942, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.8166712522506714, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8642172813415527, + "num_tokens": 298727815.0, + "step": 7830 + }, + { + "epoch": 0.9961836916422847, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.7522951364517212, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8796147108078003, + "num_tokens": 298765663.0, + "step": 7831 + }, + { + "epoch": 0.9963109019208752, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.9994968175888062, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8647175431251526, + "num_tokens": 298799360.0, + "step": 7832 + }, + { + "epoch": 0.9964381121994658, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.8735103607177734, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.860211193561554, + "num_tokens": 298835669.0, + "step": 7833 + }, + { + "epoch": 0.9965653224780562, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.091677188873291, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8585818409919739, + "num_tokens": 298866871.0, + "step": 7834 + }, + { + "epoch": 0.9966925327566467, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.8519233465194702, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8511684536933899, + "num_tokens": 298904136.0, + "step": 7835 + }, + { + "epoch": 0.9968197430352372, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.214468002319336, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8732116222381592, + "num_tokens": 298937069.0, + "step": 7836 + }, + { + "epoch": 0.9969469533138278, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 2.0055348873138428, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.852694034576416, + "num_tokens": 298976778.0, + "step": 7837 + }, + { + "epoch": 0.9970741635924183, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.8959896564483643, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8705893754959106, + "num_tokens": 299010808.0, + "step": 7838 + }, + { + "epoch": 0.9972013738710088, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 2.012768507003784, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.848409116268158, + "num_tokens": 299047611.0, + "step": 7839 + }, + { + "epoch": 0.9973285841495992, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.765385389328003, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8660250306129456, + "num_tokens": 299088668.0, + "step": 7840 + }, + { + "epoch": 0.9974557944281898, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.853406310081482, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8398628830909729, + "num_tokens": 299126572.0, + "step": 7841 + }, + { + "epoch": 0.9975830047067803, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.7531371116638184, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8713451623916626, + "num_tokens": 299164806.0, + "step": 7842 + }, + { + "epoch": 0.9977102149853708, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.8220782279968262, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.857489824295044, + "num_tokens": 299207039.0, + "step": 7843 + }, + { + "epoch": 0.9978374252639614, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.8302934169769287, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.868605375289917, + "num_tokens": 299243242.0, + "step": 7844 + }, + { + "epoch": 0.9979646355425519, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.909674048423767, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8659958839416504, + "num_tokens": 299282326.0, + "step": 7845 + }, + { + "epoch": 0.9980918458211423, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.7872744798660278, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8886690139770508, + "num_tokens": 299316645.0, + "step": 7846 + }, + { + "epoch": 0.9982190560997328, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.9352376461029053, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8632946014404297, + "num_tokens": 299349932.0, + "step": 7847 + }, + { + "epoch": 0.9983462663783234, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.7556555271148682, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8393235206604004, + "num_tokens": 299392452.0, + "step": 7848 + }, + { + "epoch": 0.9984734766569139, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.9667202234268188, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8662156462669373, + "num_tokens": 299431479.0, + "step": 7849 + }, + { + "epoch": 0.9986006869355044, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.8799234628677368, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8573039174079895, + "num_tokens": 299475585.0, + "step": 7850 + }, + { + "epoch": 0.9987278972140949, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.798414707183838, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8769501447677612, + "num_tokens": 299514026.0, + "step": 7851 + }, + { + "epoch": 0.9988551074926854, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.8968530893325806, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8496522903442383, + "num_tokens": 299550858.0, + "step": 7852 + }, + { + "epoch": 0.9989823177712759, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.887329339981079, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8537917137145996, + "num_tokens": 299586831.0, + "step": 7853 + }, + { + "epoch": 0.9991095280498664, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.827721357345581, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8703428506851196, + "num_tokens": 299623116.0, + "step": 7854 + }, + { + "epoch": 0.9992367383284569, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.8871712684631348, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8636168837547302, + "num_tokens": 299662165.0, + "step": 7855 + }, + { + "epoch": 0.9993639486070475, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.8060176372528076, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8568155169487, + "num_tokens": 299702581.0, + "step": 7856 + }, + { + "epoch": 0.999491158885638, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.8561642169952393, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8464255332946777, + "num_tokens": 299742559.0, + "step": 7857 + }, + { + "epoch": 0.9996183691642284, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.8595728874206543, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8755044341087341, + "num_tokens": 299778637.0, + "step": 7858 + }, + { + "epoch": 0.9997455794428189, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 2.4159276485443115, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8702194094657898, + "num_tokens": 299812808.0, + "step": 7859 + }, + { + "epoch": 0.9998727897214095, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.8895338773727417, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8573687076568604, + "num_tokens": 299848987.0, + "step": 7860 + }, + { + "epoch": 1.0, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 2.196647882461548, + "learning_rate": 1e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8431611061096191, + "num_tokens": 299886286.0, + "step": 7861 + }, + { + "epoch": 1.0001272102785905, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.8828339576721191, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8585824966430664, + "num_tokens": 299925456.0, + "step": 7862 + }, + { + "epoch": 1.000254420557181, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.776369571685791, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8581514358520508, + "num_tokens": 299965936.0, + "step": 7863 + }, + { + "epoch": 1.0003816308357716, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.8191107511520386, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8748610019683838, + "num_tokens": 300003181.0, + "step": 7864 + }, + { + "epoch": 1.0005088411143621, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.9548918008804321, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.859771728515625, + "num_tokens": 300040502.0, + "step": 7865 + }, + { + "epoch": 1.0006360513929526, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.8426973819732666, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8773081302642822, + "num_tokens": 300078795.0, + "step": 7866 + }, + { + "epoch": 1.0007632616715432, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.0076518058776855, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8717788457870483, + "num_tokens": 300115188.0, + "step": 7867 + }, + { + "epoch": 1.0008904719501335, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.8236626386642456, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8628727793693542, + "num_tokens": 300158583.0, + "step": 7868 + }, + { + "epoch": 1.001017682228724, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.9830231666564941, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8655173778533936, + "num_tokens": 300194415.0, + "step": 7869 + }, + { + "epoch": 1.0011448925073145, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.9391398429870605, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8572843074798584, + "num_tokens": 300235066.0, + "step": 7870 + }, + { + "epoch": 1.001272102785905, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.9442927837371826, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8650568127632141, + "num_tokens": 300272167.0, + "step": 7871 + }, + { + "epoch": 1.0013993130644956, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.676987409591675, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8534365296363831, + "num_tokens": 300306032.0, + "step": 7872 + }, + { + "epoch": 1.0015265233430861, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 2.1476762294769287, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8607531785964966, + "num_tokens": 300338433.0, + "step": 7873 + }, + { + "epoch": 1.0016537336216766, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 2.0747530460357666, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8528424501419067, + "num_tokens": 300376527.0, + "step": 7874 + }, + { + "epoch": 1.0017809439002672, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.9535508155822754, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8796360492706299, + "num_tokens": 300417309.0, + "step": 7875 + }, + { + "epoch": 1.0019081541788577, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.7644197940826416, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8643550872802734, + "num_tokens": 300459438.0, + "step": 7876 + }, + { + "epoch": 1.0020353644574482, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 2.0475924015045166, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.840049147605896, + "num_tokens": 300501443.0, + "step": 7877 + }, + { + "epoch": 1.0021625747360388, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.9741015434265137, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.870962381362915, + "num_tokens": 300531918.0, + "step": 7878 + }, + { + "epoch": 1.0022897850146293, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.823204517364502, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8564059138298035, + "num_tokens": 300575084.0, + "step": 7879 + }, + { + "epoch": 1.0024169952932196, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 2.0031425952911377, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8620002269744873, + "num_tokens": 300608737.0, + "step": 7880 + }, + { + "epoch": 1.0025442055718101, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 1.8176658153533936, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8506109118461609, + "num_tokens": 300647855.0, + "step": 7881 + }, + { + "epoch": 1.0026714158504006, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.8533657789230347, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8633208274841309, + "num_tokens": 300689763.0, + "step": 7882 + }, + { + "epoch": 1.0027986261289912, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.9987174272537231, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8727676272392273, + "num_tokens": 300725838.0, + "step": 7883 + }, + { + "epoch": 1.0029258364075817, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.9337233304977417, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8687068819999695, + "num_tokens": 300760423.0, + "step": 7884 + }, + { + "epoch": 1.0030530466861722, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.910099744796753, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8678391575813293, + "num_tokens": 300796236.0, + "step": 7885 + }, + { + "epoch": 1.0031802569647628, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.7604035139083862, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8674954771995544, + "num_tokens": 300835213.0, + "step": 7886 + }, + { + "epoch": 1.0033074672433533, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.8605973720550537, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.869697630405426, + "num_tokens": 300873363.0, + "step": 7887 + }, + { + "epoch": 1.0034346775219438, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 2.022433280944824, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8581656217575073, + "num_tokens": 300910233.0, + "step": 7888 + }, + { + "epoch": 1.0035618878005343, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.931687831878662, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8704641461372375, + "num_tokens": 300946986.0, + "step": 7889 + }, + { + "epoch": 1.0036890980791249, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.9568208456039429, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8641438484191895, + "num_tokens": 300985750.0, + "step": 7890 + }, + { + "epoch": 1.0038163083577154, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.8008009195327759, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8614320158958435, + "num_tokens": 301032820.0, + "step": 7891 + }, + { + "epoch": 1.0039435186363057, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.9118350744247437, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8529930114746094, + "num_tokens": 301070825.0, + "step": 7892 + }, + { + "epoch": 1.0040707289148962, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.8685636520385742, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8602361083030701, + "num_tokens": 301110764.0, + "step": 7893 + }, + { + "epoch": 1.0041979391934868, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.856882095336914, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8657606244087219, + "num_tokens": 301152421.0, + "step": 7894 + }, + { + "epoch": 1.0043251494720773, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.80238676071167, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8634230494499207, + "num_tokens": 301192780.0, + "step": 7895 + }, + { + "epoch": 1.0044523597506678, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.124295473098755, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8594411611557007, + "num_tokens": 301228147.0, + "step": 7896 + }, + { + "epoch": 1.0045795700292584, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.8430583477020264, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8719319105148315, + "num_tokens": 301259568.0, + "step": 7897 + }, + { + "epoch": 1.0047067803078489, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.0433666706085205, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.863112211227417, + "num_tokens": 301296483.0, + "step": 7898 + }, + { + "epoch": 1.0048339905864394, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.8324012756347656, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8641622066497803, + "num_tokens": 301337437.0, + "step": 7899 + }, + { + "epoch": 1.00496120086503, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.069037914276123, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.863275945186615, + "num_tokens": 301368777.0, + "step": 7900 + }, + { + "epoch": 1.0050884111436205, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.0165061950683594, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8601835370063782, + "num_tokens": 301404885.0, + "step": 7901 + }, + { + "epoch": 1.005215621422211, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.8339853286743164, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8521856665611267, + "num_tokens": 301445308.0, + "step": 7902 + }, + { + "epoch": 1.0053428317008015, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 1.9902284145355225, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8612041473388672, + "num_tokens": 301484845.0, + "step": 7903 + }, + { + "epoch": 1.0054700419793918, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.8622874021530151, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.861896812915802, + "num_tokens": 301524498.0, + "step": 7904 + }, + { + "epoch": 1.0055972522579824, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.9990416765213013, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8680127859115601, + "num_tokens": 301565522.0, + "step": 7905 + }, + { + "epoch": 1.0057244625365729, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.0040578842163086, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8586041927337646, + "num_tokens": 301603724.0, + "step": 7906 + }, + { + "epoch": 1.0058516728151634, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.130690813064575, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8610855340957642, + "num_tokens": 301636650.0, + "step": 7907 + }, + { + "epoch": 1.005978883093754, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.913346529006958, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8526497483253479, + "num_tokens": 301675328.0, + "step": 7908 + }, + { + "epoch": 1.0061060933723445, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.004611015319824, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8562163710594177, + "num_tokens": 301710058.0, + "step": 7909 + }, + { + "epoch": 1.006233303650935, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.7281014919281006, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8569413423538208, + "num_tokens": 301753567.0, + "step": 7910 + }, + { + "epoch": 1.0063605139295255, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 1.8574196100234985, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.869373083114624, + "num_tokens": 301792895.0, + "step": 7911 + }, + { + "epoch": 1.006487724208116, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.761643886566162, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8738837242126465, + "num_tokens": 301833001.0, + "step": 7912 + }, + { + "epoch": 1.0066149344867066, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.0019006729125977, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.865587592124939, + "num_tokens": 301871623.0, + "step": 7913 + }, + { + "epoch": 1.006742144765297, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.8913869857788086, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8748546242713928, + "num_tokens": 301903750.0, + "step": 7914 + }, + { + "epoch": 1.0068693550438876, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.7680959701538086, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.862587571144104, + "num_tokens": 301943079.0, + "step": 7915 + }, + { + "epoch": 1.0069965653224782, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.8773149251937866, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8682336807250977, + "num_tokens": 301980031.0, + "step": 7916 + }, + { + "epoch": 1.0071237756010685, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.7436319589614868, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8622691631317139, + "num_tokens": 302020454.0, + "step": 7917 + }, + { + "epoch": 1.007250985879659, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.7689651250839233, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8566179871559143, + "num_tokens": 302064391.0, + "step": 7918 + }, + { + "epoch": 1.0073781961582495, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.9334992170333862, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8625097870826721, + "num_tokens": 302101598.0, + "step": 7919 + }, + { + "epoch": 1.00750540643684, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.0543205738067627, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.866395115852356, + "num_tokens": 302138803.0, + "step": 7920 + }, + { + "epoch": 1.0076326167154306, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.9549109935760498, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8606656789779663, + "num_tokens": 302173682.0, + "step": 7921 + }, + { + "epoch": 1.0077598269940211, + "ewc_loss": 7.18235969543457e-06, + "grad_norm": 1.923600435256958, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8598655462265015, + "num_tokens": 302207390.0, + "step": 7922 + }, + { + "epoch": 1.0078870372726116, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.9025447368621826, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8672055006027222, + "num_tokens": 302244440.0, + "step": 7923 + }, + { + "epoch": 1.0080142475512022, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.876848578453064, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8574070930480957, + "num_tokens": 302281058.0, + "step": 7924 + }, + { + "epoch": 1.0081414578297927, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.9511867761611938, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8784891963005066, + "num_tokens": 302319586.0, + "step": 7925 + }, + { + "epoch": 1.0082686681083832, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.8787403106689453, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8570669889450073, + "num_tokens": 302361356.0, + "step": 7926 + }, + { + "epoch": 1.0083958783869738, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8126535415649414, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8728693723678589, + "num_tokens": 302399974.0, + "step": 7927 + }, + { + "epoch": 1.0085230886655643, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.8157691955566406, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8660184144973755, + "num_tokens": 302435709.0, + "step": 7928 + }, + { + "epoch": 1.0086502989441546, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.868162751197815, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8585917949676514, + "num_tokens": 302474151.0, + "step": 7929 + }, + { + "epoch": 1.0087775092227451, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.9535576105117798, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8652207851409912, + "num_tokens": 302509877.0, + "step": 7930 + }, + { + "epoch": 1.0089047195013356, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.8918696641921997, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8746604919433594, + "num_tokens": 302547689.0, + "step": 7931 + }, + { + "epoch": 1.0090319297799262, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.7863123416900635, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8610153198242188, + "num_tokens": 302590728.0, + "step": 7932 + }, + { + "epoch": 1.0091591400585167, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.0671286582946777, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8401482105255127, + "num_tokens": 302625709.0, + "step": 7933 + }, + { + "epoch": 1.0092863503371072, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.1924004554748535, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8700249195098877, + "num_tokens": 302662086.0, + "step": 7934 + }, + { + "epoch": 1.0094135606156978, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8465781211853027, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8798777461051941, + "num_tokens": 302703680.0, + "step": 7935 + }, + { + "epoch": 1.0095407708942883, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.965556025505066, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8464419841766357, + "num_tokens": 302740426.0, + "step": 7936 + }, + { + "epoch": 1.0096679811728788, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8979884386062622, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8750008940696716, + "num_tokens": 302780778.0, + "step": 7937 + }, + { + "epoch": 1.0097951914514693, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9798330068588257, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8670098781585693, + "num_tokens": 302816012.0, + "step": 7938 + }, + { + "epoch": 1.0099224017300599, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9519164562225342, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8414280414581299, + "num_tokens": 302858325.0, + "step": 7939 + }, + { + "epoch": 1.0100496120086504, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9740195274353027, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8632206916809082, + "num_tokens": 302898210.0, + "step": 7940 + }, + { + "epoch": 1.0101768222872407, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.0277962684631348, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8638875484466553, + "num_tokens": 302936072.0, + "step": 7941 + }, + { + "epoch": 1.0103040325658312, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.005774974822998, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8533584475517273, + "num_tokens": 302978941.0, + "step": 7942 + }, + { + "epoch": 1.0104312428444218, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9375900030136108, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8550755977630615, + "num_tokens": 303016664.0, + "step": 7943 + }, + { + "epoch": 1.0105584531230123, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.7892780303955078, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.859036922454834, + "num_tokens": 303063313.0, + "step": 7944 + }, + { + "epoch": 1.0106856634016028, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9242225885391235, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8521319031715393, + "num_tokens": 303103786.0, + "step": 7945 + }, + { + "epoch": 1.0108128736801933, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.9655840396881104, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8700438141822815, + "num_tokens": 303136928.0, + "step": 7946 + }, + { + "epoch": 1.0109400839587839, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.0247457027435303, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.871498167514801, + "num_tokens": 303174854.0, + "step": 7947 + }, + { + "epoch": 1.0110672942373744, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8847074508666992, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8600937128067017, + "num_tokens": 303214234.0, + "step": 7948 + }, + { + "epoch": 1.011194504515965, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.8188748359680176, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8761079907417297, + "num_tokens": 303251830.0, + "step": 7949 + }, + { + "epoch": 1.0113217147945555, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.816209077835083, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8530632853507996, + "num_tokens": 303291564.0, + "step": 7950 + }, + { + "epoch": 1.011448925073146, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.926364779472351, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.859804093837738, + "num_tokens": 303328642.0, + "step": 7951 + }, + { + "epoch": 1.0115761353517365, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.8665473461151123, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8609384298324585, + "num_tokens": 303373693.0, + "step": 7952 + }, + { + "epoch": 1.0117033456303268, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9881501197814941, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8669456839561462, + "num_tokens": 303407124.0, + "step": 7953 + }, + { + "epoch": 1.0118305559089174, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.006688117980957, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.873856246471405, + "num_tokens": 303441586.0, + "step": 7954 + }, + { + "epoch": 1.0119577661875079, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.059856653213501, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8642721176147461, + "num_tokens": 303473593.0, + "step": 7955 + }, + { + "epoch": 1.0120849764660984, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.983432650566101, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8596676588058472, + "num_tokens": 303510239.0, + "step": 7956 + }, + { + "epoch": 1.012212186744689, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.0348122119903564, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8612080216407776, + "num_tokens": 303544709.0, + "step": 7957 + }, + { + "epoch": 1.0123393970232795, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8681604862213135, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8718931078910828, + "num_tokens": 303581990.0, + "step": 7958 + }, + { + "epoch": 1.01246660730187, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.790128469467163, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8664151430130005, + "num_tokens": 303625167.0, + "step": 7959 + }, + { + "epoch": 1.0125938175804605, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9537667036056519, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8644499778747559, + "num_tokens": 303664751.0, + "step": 7960 + }, + { + "epoch": 1.012721027859051, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9778831005096436, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8605167865753174, + "num_tokens": 303702190.0, + "step": 7961 + }, + { + "epoch": 1.0128482381376416, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.796849012374878, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.877252459526062, + "num_tokens": 303742872.0, + "step": 7962 + }, + { + "epoch": 1.012975448416232, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.950945496559143, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8553020358085632, + "num_tokens": 303781265.0, + "step": 7963 + }, + { + "epoch": 1.0131026586948226, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8162269592285156, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8631166815757751, + "num_tokens": 303820686.0, + "step": 7964 + }, + { + "epoch": 1.0132298689734132, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.7057209014892578, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8651335835456848, + "num_tokens": 303863428.0, + "step": 7965 + }, + { + "epoch": 1.0133570792520035, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 7.691220760345459, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8587861061096191, + "num_tokens": 303903847.0, + "step": 7966 + }, + { + "epoch": 1.013484289530594, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.984203815460205, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.854644775390625, + "num_tokens": 303943083.0, + "step": 7967 + }, + { + "epoch": 1.0136114998091845, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.833288311958313, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8729588389396667, + "num_tokens": 303978773.0, + "step": 7968 + }, + { + "epoch": 1.013738710087775, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.8134095668792725, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.869581937789917, + "num_tokens": 304017058.0, + "step": 7969 + }, + { + "epoch": 1.0138659203663656, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.0233685970306396, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8532437086105347, + "num_tokens": 304054104.0, + "step": 7970 + }, + { + "epoch": 1.013993130644956, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8212617635726929, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8785490393638611, + "num_tokens": 304091100.0, + "step": 7971 + }, + { + "epoch": 1.0141203409235466, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8856436014175415, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8638463616371155, + "num_tokens": 304127747.0, + "step": 7972 + }, + { + "epoch": 1.0142475512021372, + "ewc_loss": 7.241964340209961e-06, + "grad_norm": 1.7377647161483765, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8634903430938721, + "num_tokens": 304171089.0, + "step": 7973 + }, + { + "epoch": 1.0143747614807277, + "ewc_loss": 7.241964340209961e-06, + "grad_norm": 1.8928380012512207, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8537125587463379, + "num_tokens": 304210207.0, + "step": 7974 + }, + { + "epoch": 1.0145019717593182, + "ewc_loss": 7.241964340209961e-06, + "grad_norm": 1.8768327236175537, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.879771888256073, + "num_tokens": 304249423.0, + "step": 7975 + }, + { + "epoch": 1.0146291820379088, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9178835153579712, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8675373792648315, + "num_tokens": 304286872.0, + "step": 7976 + }, + { + "epoch": 1.0147563923164993, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9886010885238647, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8663737177848816, + "num_tokens": 304323033.0, + "step": 7977 + }, + { + "epoch": 1.0148836025950896, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.9972585439682007, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8662666082382202, + "num_tokens": 304360096.0, + "step": 7978 + }, + { + "epoch": 1.0150108128736801, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8677799701690674, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8830456733703613, + "num_tokens": 304396364.0, + "step": 7979 + }, + { + "epoch": 1.0151380231522706, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.9108901023864746, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8656983971595764, + "num_tokens": 304431583.0, + "step": 7980 + }, + { + "epoch": 1.0152652334308612, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 7.7880449295043945, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8578293323516846, + "num_tokens": 304470531.0, + "step": 7981 + }, + { + "epoch": 1.0153924437094517, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.2098703384399414, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.867135763168335, + "num_tokens": 304505187.0, + "step": 7982 + }, + { + "epoch": 1.0155196539880422, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 2.0647687911987305, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8403934240341187, + "num_tokens": 304542685.0, + "step": 7983 + }, + { + "epoch": 1.0156468642666328, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9202393293380737, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8626944422721863, + "num_tokens": 304579204.0, + "step": 7984 + }, + { + "epoch": 1.0157740745452233, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9020695686340332, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.870063066482544, + "num_tokens": 304615224.0, + "step": 7985 + }, + { + "epoch": 1.0159012848238138, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8596878051757812, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8578946590423584, + "num_tokens": 304653513.0, + "step": 7986 + }, + { + "epoch": 1.0160284951024043, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9797565937042236, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8691574931144714, + "num_tokens": 304686744.0, + "step": 7987 + }, + { + "epoch": 1.0161557053809949, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.822532296180725, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8624191284179688, + "num_tokens": 304727262.0, + "step": 7988 + }, + { + "epoch": 1.0162829156595854, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.9391164779663086, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8624945282936096, + "num_tokens": 304763418.0, + "step": 7989 + }, + { + "epoch": 1.0164101259381757, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.7707219123840332, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8662369251251221, + "num_tokens": 304800763.0, + "step": 7990 + }, + { + "epoch": 1.0165373362167662, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.9628651142120361, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8825061917304993, + "num_tokens": 304833136.0, + "step": 7991 + }, + { + "epoch": 1.0166645464953568, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.936793565750122, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8718469142913818, + "num_tokens": 304873208.0, + "step": 7992 + }, + { + "epoch": 1.0167917567739473, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.0126640796661377, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8640751838684082, + "num_tokens": 304914237.0, + "step": 7993 + }, + { + "epoch": 1.0169189670525378, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.8389555215835571, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8789997696876526, + "num_tokens": 304955453.0, + "step": 7994 + }, + { + "epoch": 1.0170461773311283, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.9520758390426636, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8627570867538452, + "num_tokens": 304990533.0, + "step": 7995 + }, + { + "epoch": 1.0171733876097189, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.977513074874878, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8627591133117676, + "num_tokens": 305024849.0, + "step": 7996 + }, + { + "epoch": 1.0173005978883094, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9280198812484741, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8603844046592712, + "num_tokens": 305061615.0, + "step": 7997 + }, + { + "epoch": 1.0174278081669, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.8868615627288818, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8440386056900024, + "num_tokens": 305102251.0, + "step": 7998 + }, + { + "epoch": 1.0175550184454905, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 2.0380020141601562, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8734958171844482, + "num_tokens": 305135405.0, + "step": 7999 + }, + { + "epoch": 1.017682228724081, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.0194873809814453, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8626757860183716, + "num_tokens": 305169005.0, + "step": 8000 + }, + { + "epoch": 1.0178094390026715, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.7907637357711792, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8599948287010193, + "num_tokens": 305209050.0, + "step": 8001 + }, + { + "epoch": 1.0179366492812618, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.099670648574829, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8644331693649292, + "num_tokens": 305238250.0, + "step": 8002 + }, + { + "epoch": 1.0180638595598523, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.034804344177246, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8671082258224487, + "num_tokens": 305283544.0, + "step": 8003 + }, + { + "epoch": 1.0181910698384429, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9689664840698242, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8454163074493408, + "num_tokens": 305320548.0, + "step": 8004 + }, + { + "epoch": 1.0183182801170334, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9932222366333008, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8702309131622314, + "num_tokens": 305359560.0, + "step": 8005 + }, + { + "epoch": 1.018445490395624, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.104471206665039, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8596408367156982, + "num_tokens": 305396680.0, + "step": 8006 + }, + { + "epoch": 1.0185727006742145, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.8000152111053467, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8737250566482544, + "num_tokens": 305432856.0, + "step": 8007 + }, + { + "epoch": 1.018699910952805, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.869856595993042, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8649817705154419, + "num_tokens": 305469973.0, + "step": 8008 + }, + { + "epoch": 1.0188271212313955, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9088715314865112, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8673778772354126, + "num_tokens": 305505576.0, + "step": 8009 + }, + { + "epoch": 1.018954331509986, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.082456588745117, + "learning_rate": 1e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8438162207603455, + "num_tokens": 305542037.0, + "step": 8010 + }, + { + "epoch": 1.0190815417885766, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.554701328277588, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8707959651947021, + "num_tokens": 305587784.0, + "step": 8011 + }, + { + "epoch": 1.019208752067167, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9530571699142456, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.856033205986023, + "num_tokens": 305621888.0, + "step": 8012 + }, + { + "epoch": 1.0193359623457576, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.6898373365402222, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8832803964614868, + "num_tokens": 305663844.0, + "step": 8013 + }, + { + "epoch": 1.0194631726243482, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9625496864318848, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8783129453659058, + "num_tokens": 305695913.0, + "step": 8014 + }, + { + "epoch": 1.0195903829029385, + "ewc_loss": 7.241964340209961e-06, + "grad_norm": 1.941432237625122, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8618246912956238, + "num_tokens": 305733632.0, + "step": 8015 + }, + { + "epoch": 1.019717593181529, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.0823047161102295, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8485841155052185, + "num_tokens": 305766917.0, + "step": 8016 + }, + { + "epoch": 1.0198448034601195, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9166806936264038, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.859044075012207, + "num_tokens": 305805768.0, + "step": 8017 + }, + { + "epoch": 1.01997201373871, + "ewc_loss": 7.241964340209961e-06, + "grad_norm": 1.8267269134521484, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8666257858276367, + "num_tokens": 305843412.0, + "step": 8018 + }, + { + "epoch": 1.0200992240173006, + "ewc_loss": 7.241964340209961e-06, + "grad_norm": 1.741906762123108, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8796131014823914, + "num_tokens": 305880523.0, + "step": 8019 + }, + { + "epoch": 1.020226434295891, + "ewc_loss": 7.241964340209961e-06, + "grad_norm": 3.092541456222534, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8652052879333496, + "num_tokens": 305915379.0, + "step": 8020 + }, + { + "epoch": 1.0203536445744816, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.2660131454467773, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8781036138534546, + "num_tokens": 305952471.0, + "step": 8021 + }, + { + "epoch": 1.0204808548530722, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.7589647769927979, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8622912168502808, + "num_tokens": 305994023.0, + "step": 8022 + }, + { + "epoch": 1.0206080651316627, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.946420431137085, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8688156604766846, + "num_tokens": 306026563.0, + "step": 8023 + }, + { + "epoch": 1.0207352754102532, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.7620428800582886, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8594634532928467, + "num_tokens": 306067913.0, + "step": 8024 + }, + { + "epoch": 1.0208624856888437, + "ewc_loss": 7.241964340209961e-06, + "grad_norm": 1.8919885158538818, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.862014651298523, + "num_tokens": 306108692.0, + "step": 8025 + }, + { + "epoch": 1.0209896959674343, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.853360652923584, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8637759685516357, + "num_tokens": 306148082.0, + "step": 8026 + }, + { + "epoch": 1.0211169062460246, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9203256368637085, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8711897730827332, + "num_tokens": 306182800.0, + "step": 8027 + }, + { + "epoch": 1.021244116524615, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.7121474742889404, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8608105778694153, + "num_tokens": 306226741.0, + "step": 8028 + }, + { + "epoch": 1.0213713268032056, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.244661569595337, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8626407384872437, + "num_tokens": 306265993.0, + "step": 8029 + }, + { + "epoch": 1.0214985370817962, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 3.0434117317199707, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8671295046806335, + "num_tokens": 306306511.0, + "step": 8030 + }, + { + "epoch": 1.0216257473603867, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.968785285949707, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.879539430141449, + "num_tokens": 306337687.0, + "step": 8031 + }, + { + "epoch": 1.0217529576389772, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8904130458831787, + "learning_rate": 1e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.8297902345657349, + "num_tokens": 306379646.0, + "step": 8032 + }, + { + "epoch": 1.0218801679175677, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8564597368240356, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8646025657653809, + "num_tokens": 306415389.0, + "step": 8033 + }, + { + "epoch": 1.0220073781961583, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.533588409423828, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8598127961158752, + "num_tokens": 306452035.0, + "step": 8034 + }, + { + "epoch": 1.0221345884747488, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.734247088432312, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8630237579345703, + "num_tokens": 306496277.0, + "step": 8035 + }, + { + "epoch": 1.0222617987533393, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8972644805908203, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8692679405212402, + "num_tokens": 306528212.0, + "step": 8036 + }, + { + "epoch": 1.0223890090319299, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9762598276138306, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8675186634063721, + "num_tokens": 306562047.0, + "step": 8037 + }, + { + "epoch": 1.0225162193105204, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.099728584289551, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8535060882568359, + "num_tokens": 306601298.0, + "step": 8038 + }, + { + "epoch": 1.0226434295891107, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9192346334457397, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8634048700332642, + "num_tokens": 306640961.0, + "step": 8039 + }, + { + "epoch": 1.0227706398677012, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.7482647895812988, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8812031745910645, + "num_tokens": 306679507.0, + "step": 8040 + }, + { + "epoch": 1.0228978501462918, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.0647547245025635, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8542730212211609, + "num_tokens": 306717337.0, + "step": 8041 + }, + { + "epoch": 1.0230250604248823, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.3101296424865723, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8380821347236633, + "num_tokens": 306754152.0, + "step": 8042 + }, + { + "epoch": 1.0231522707034728, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9322196245193481, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8570676445960999, + "num_tokens": 306794887.0, + "step": 8043 + }, + { + "epoch": 1.0232794809820633, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9385089874267578, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8666690587997437, + "num_tokens": 306828374.0, + "step": 8044 + }, + { + "epoch": 1.0234066912606539, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8634032011032104, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8662993907928467, + "num_tokens": 306865285.0, + "step": 8045 + }, + { + "epoch": 1.0235339015392444, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8634259700775146, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8699899911880493, + "num_tokens": 306897195.0, + "step": 8046 + }, + { + "epoch": 1.023661111817835, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8204419612884521, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8630362153053284, + "num_tokens": 306935352.0, + "step": 8047 + }, + { + "epoch": 1.0237883220964255, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.0532288551330566, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.868495523929596, + "num_tokens": 306964621.0, + "step": 8048 + }, + { + "epoch": 1.023915532375016, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.0943663120269775, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8625909686088562, + "num_tokens": 307003451.0, + "step": 8049 + }, + { + "epoch": 1.0240427426536065, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.989593267440796, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8646125793457031, + "num_tokens": 307044695.0, + "step": 8050 + }, + { + "epoch": 1.0241699529321968, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9209060668945312, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8698378205299377, + "num_tokens": 307080603.0, + "step": 8051 + }, + { + "epoch": 1.0242971632107873, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9818360805511475, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8561376929283142, + "num_tokens": 307122300.0, + "step": 8052 + }, + { + "epoch": 1.0244243734893779, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8675198554992676, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8702578544616699, + "num_tokens": 307158685.0, + "step": 8053 + }, + { + "epoch": 1.0245515837679684, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.842681884765625, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8681819438934326, + "num_tokens": 307199104.0, + "step": 8054 + }, + { + "epoch": 1.024678794046559, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9353852272033691, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8573521375656128, + "num_tokens": 307235840.0, + "step": 8055 + }, + { + "epoch": 1.0248060043251495, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9830540418624878, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8524549007415771, + "num_tokens": 307273089.0, + "step": 8056 + }, + { + "epoch": 1.02493321460374, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8198920488357544, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8638471364974976, + "num_tokens": 307318825.0, + "step": 8057 + }, + { + "epoch": 1.0250604248823305, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8942928314208984, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8692677021026611, + "num_tokens": 307357006.0, + "step": 8058 + }, + { + "epoch": 1.025187635160921, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 16.617956161499023, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8603063821792603, + "num_tokens": 307390274.0, + "step": 8059 + }, + { + "epoch": 1.0253148454395116, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.039658784866333, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.871783971786499, + "num_tokens": 307427157.0, + "step": 8060 + }, + { + "epoch": 1.025442055718102, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 1.9283329248428345, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8601762652397156, + "num_tokens": 307466555.0, + "step": 8061 + }, + { + "epoch": 1.0255692659966926, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9242290258407593, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8559424877166748, + "num_tokens": 307503375.0, + "step": 8062 + }, + { + "epoch": 1.0256964762752832, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9219236373901367, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8724686503410339, + "num_tokens": 307543170.0, + "step": 8063 + }, + { + "epoch": 1.0258236865538735, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9704091548919678, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8602151870727539, + "num_tokens": 307582890.0, + "step": 8064 + }, + { + "epoch": 1.025950896832464, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 2.0349795818328857, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8560064435005188, + "num_tokens": 307617643.0, + "step": 8065 + }, + { + "epoch": 1.0260781071110545, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9527848958969116, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8615013360977173, + "num_tokens": 307652258.0, + "step": 8066 + }, + { + "epoch": 1.026205317389645, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8404191732406616, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8601005673408508, + "num_tokens": 307693245.0, + "step": 8067 + }, + { + "epoch": 1.0263325276682356, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8989030122756958, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8662081956863403, + "num_tokens": 307728751.0, + "step": 8068 + }, + { + "epoch": 1.026459737946826, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8606641292572021, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8526675701141357, + "num_tokens": 307767869.0, + "step": 8069 + }, + { + "epoch": 1.0265869482254166, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9639098644256592, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8735747933387756, + "num_tokens": 307809438.0, + "step": 8070 + }, + { + "epoch": 1.0267141585040072, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.00500226020813, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.843536913394928, + "num_tokens": 307848752.0, + "step": 8071 + }, + { + "epoch": 1.0268413687825977, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.9161657094955444, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8574077486991882, + "num_tokens": 307886080.0, + "step": 8072 + }, + { + "epoch": 1.0269685790611882, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.743914842605591, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.875743567943573, + "num_tokens": 307922854.0, + "step": 8073 + }, + { + "epoch": 1.0270957893397787, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9516491889953613, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8601535558700562, + "num_tokens": 307960032.0, + "step": 8074 + }, + { + "epoch": 1.0272229996183693, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.103170394897461, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8623444437980652, + "num_tokens": 307996074.0, + "step": 8075 + }, + { + "epoch": 1.0273502098969596, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.8648059368133545, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.877191424369812, + "num_tokens": 308030833.0, + "step": 8076 + }, + { + "epoch": 1.02747742017555, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.7293566465377808, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8722983598709106, + "num_tokens": 308079620.0, + "step": 8077 + }, + { + "epoch": 1.0276046304541406, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 16.615581512451172, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8575147390365601, + "num_tokens": 308118555.0, + "step": 8078 + }, + { + "epoch": 1.0277318407327312, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 2.8924810886383057, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8464764356613159, + "num_tokens": 308154410.0, + "step": 8079 + }, + { + "epoch": 1.0278590510113217, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 2.189450740814209, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.864535927772522, + "num_tokens": 308191990.0, + "step": 8080 + }, + { + "epoch": 1.0279862612899122, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 1.9321399927139282, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8439532518386841, + "num_tokens": 308231047.0, + "step": 8081 + }, + { + "epoch": 1.0281134715685027, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 1.7722141742706299, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8786716461181641, + "num_tokens": 308269487.0, + "step": 8082 + }, + { + "epoch": 1.0282406818470933, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9559111595153809, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8622442483901978, + "num_tokens": 308304974.0, + "step": 8083 + }, + { + "epoch": 1.0283678921256838, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.8280287981033325, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8842020034790039, + "num_tokens": 308342072.0, + "step": 8084 + }, + { + "epoch": 1.0284951024042743, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9660518169403076, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8588584065437317, + "num_tokens": 308374293.0, + "step": 8085 + }, + { + "epoch": 1.0286223126828649, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.8273290395736694, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8674342632293701, + "num_tokens": 308415872.0, + "step": 8086 + }, + { + "epoch": 1.0287495229614554, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 2.0007545948028564, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8566077351570129, + "num_tokens": 308451198.0, + "step": 8087 + }, + { + "epoch": 1.0288767332400457, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 1.9301347732543945, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8716796636581421, + "num_tokens": 308490554.0, + "step": 8088 + }, + { + "epoch": 1.0290039435186362, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.8093599081039429, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8585624694824219, + "num_tokens": 308530221.0, + "step": 8089 + }, + { + "epoch": 1.0291311537972267, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.005258798599243, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.848197340965271, + "num_tokens": 308563634.0, + "step": 8090 + }, + { + "epoch": 1.0292583640758173, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.8611429929733276, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8628828525543213, + "num_tokens": 308605033.0, + "step": 8091 + }, + { + "epoch": 1.0293855743544078, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9063947200775146, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8647453784942627, + "num_tokens": 308642603.0, + "step": 8092 + }, + { + "epoch": 1.0295127846329983, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.7377851009368896, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8609166145324707, + "num_tokens": 308685189.0, + "step": 8093 + }, + { + "epoch": 1.0296399949115889, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 1.9111027717590332, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8644511699676514, + "num_tokens": 308726052.0, + "step": 8094 + }, + { + "epoch": 1.0297672051901794, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.927115797996521, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8507565259933472, + "num_tokens": 308764828.0, + "step": 8095 + }, + { + "epoch": 1.02989441546877, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9086673259735107, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.872944712638855, + "num_tokens": 308799712.0, + "step": 8096 + }, + { + "epoch": 1.0300216257473604, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.8850189447402954, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8635672330856323, + "num_tokens": 308839879.0, + "step": 8097 + }, + { + "epoch": 1.030148836025951, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9091275930404663, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8557076454162598, + "num_tokens": 308880362.0, + "step": 8098 + }, + { + "epoch": 1.0302760463045415, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.7890725135803223, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8733415603637695, + "num_tokens": 308921395.0, + "step": 8099 + }, + { + "epoch": 1.0304032565831318, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.8542852401733398, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8715319633483887, + "num_tokens": 308959976.0, + "step": 8100 + }, + { + "epoch": 1.0305304668617223, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.8190524578094482, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8752380609512329, + "num_tokens": 309000702.0, + "step": 8101 + }, + { + "epoch": 1.0306576771403129, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.8075413703918457, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8823988437652588, + "num_tokens": 309036540.0, + "step": 8102 + }, + { + "epoch": 1.0307848874189034, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.798973560333252, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8625171184539795, + "num_tokens": 309072408.0, + "step": 8103 + }, + { + "epoch": 1.030912097697494, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 2.030482053756714, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.848876953125, + "num_tokens": 309110366.0, + "step": 8104 + }, + { + "epoch": 1.0310393079760845, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 1.709725260734558, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8665894269943237, + "num_tokens": 309155194.0, + "step": 8105 + }, + { + "epoch": 1.031166518254675, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9210108518600464, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8719775080680847, + "num_tokens": 309193747.0, + "step": 8106 + }, + { + "epoch": 1.0312937285332655, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.71652889251709, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8506852388381958, + "num_tokens": 309234161.0, + "step": 8107 + }, + { + "epoch": 1.031420938811856, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 1.898888349533081, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8527958989143372, + "num_tokens": 309272574.0, + "step": 8108 + }, + { + "epoch": 1.0315481490904466, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9751726388931274, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8652573823928833, + "num_tokens": 309306567.0, + "step": 8109 + }, + { + "epoch": 1.031675359369037, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.8263646364212036, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8689804077148438, + "num_tokens": 309347218.0, + "step": 8110 + }, + { + "epoch": 1.0318025696476276, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 16.60618782043457, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8778378367424011, + "num_tokens": 309385701.0, + "step": 8111 + }, + { + "epoch": 1.0319297799262181, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9646263122558594, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8612695932388306, + "num_tokens": 309424600.0, + "step": 8112 + }, + { + "epoch": 1.0320569902048085, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 2.063997507095337, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8441771268844604, + "num_tokens": 309459929.0, + "step": 8113 + }, + { + "epoch": 1.032184200483399, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 1.7892793416976929, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8547251224517822, + "num_tokens": 309505085.0, + "step": 8114 + }, + { + "epoch": 1.0323114107619895, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 1.8286457061767578, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.865958034992218, + "num_tokens": 309544746.0, + "step": 8115 + }, + { + "epoch": 1.03243862104058, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.0282938480377197, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8497303128242493, + "num_tokens": 309584301.0, + "step": 8116 + }, + { + "epoch": 1.0325658313191706, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9129503965377808, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8662742376327515, + "num_tokens": 309622570.0, + "step": 8117 + }, + { + "epoch": 1.032693041597761, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9419121742248535, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8476570844650269, + "num_tokens": 309666417.0, + "step": 8118 + }, + { + "epoch": 1.0328202518763516, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9110900163650513, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8535621166229248, + "num_tokens": 309705706.0, + "step": 8119 + }, + { + "epoch": 1.0329474621549422, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.8448352813720703, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8717925548553467, + "num_tokens": 309744288.0, + "step": 8120 + }, + { + "epoch": 1.0330746724335327, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.063081979751587, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8507586121559143, + "num_tokens": 309780406.0, + "step": 8121 + }, + { + "epoch": 1.0332018827121232, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.2679073810577393, + "learning_rate": 1e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.8311866521835327, + "num_tokens": 309811135.0, + "step": 8122 + }, + { + "epoch": 1.0333290929907137, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9317439794540405, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.868156373500824, + "num_tokens": 309847037.0, + "step": 8123 + }, + { + "epoch": 1.0334563032693043, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.8872363567352295, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8640868067741394, + "num_tokens": 309884274.0, + "step": 8124 + }, + { + "epoch": 1.0335835135478946, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.9914008378982544, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8758994340896606, + "num_tokens": 309915572.0, + "step": 8125 + }, + { + "epoch": 1.033710723826485, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.788830041885376, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8682920336723328, + "num_tokens": 309954502.0, + "step": 8126 + }, + { + "epoch": 1.0338379341050756, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7166589498519897, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8844913244247437, + "num_tokens": 309993015.0, + "step": 8127 + }, + { + "epoch": 1.0339651443836662, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.96770441532135, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8535891771316528, + "num_tokens": 310030840.0, + "step": 8128 + }, + { + "epoch": 1.0340923546622567, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 80.52130126953125, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8643378615379333, + "num_tokens": 310067586.0, + "step": 8129 + }, + { + "epoch": 1.0342195649408472, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.0323362350463867, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8723375201225281, + "num_tokens": 310104496.0, + "step": 8130 + }, + { + "epoch": 1.0343467752194377, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.944572925567627, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8466700315475464, + "num_tokens": 310141162.0, + "step": 8131 + }, + { + "epoch": 1.0344739854980283, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8631726503372192, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8610711693763733, + "num_tokens": 310182965.0, + "step": 8132 + }, + { + "epoch": 1.0346011957766188, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.0029947757720947, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8447962999343872, + "num_tokens": 310219572.0, + "step": 8133 + }, + { + "epoch": 1.0347284060552093, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.0937345027923584, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8515478372573853, + "num_tokens": 310248711.0, + "step": 8134 + }, + { + "epoch": 1.0348556163337999, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.0970284938812256, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8480563759803772, + "num_tokens": 310287915.0, + "step": 8135 + }, + { + "epoch": 1.0349828266123904, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.0021989345550537, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.861221194267273, + "num_tokens": 310325619.0, + "step": 8136 + }, + { + "epoch": 1.0351100368909807, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9440619945526123, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8705737590789795, + "num_tokens": 310362456.0, + "step": 8137 + }, + { + "epoch": 1.0352372471695712, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9858007431030273, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8578904271125793, + "num_tokens": 310399584.0, + "step": 8138 + }, + { + "epoch": 1.0353644574481617, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.0297203063964844, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8568938970565796, + "num_tokens": 310437435.0, + "step": 8139 + }, + { + "epoch": 1.0354916677267523, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8682811260223389, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8683449029922485, + "num_tokens": 310476320.0, + "step": 8140 + }, + { + "epoch": 1.0356188780053428, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.799804925918579, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8786050081253052, + "num_tokens": 310512956.0, + "step": 8141 + }, + { + "epoch": 1.0357460882839333, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9978965520858765, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8611292839050293, + "num_tokens": 310547605.0, + "step": 8142 + }, + { + "epoch": 1.0358732985625239, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.88441801071167, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8608047366142273, + "num_tokens": 310586991.0, + "step": 8143 + }, + { + "epoch": 1.0360005088411144, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9895013570785522, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8629271984100342, + "num_tokens": 310621271.0, + "step": 8144 + }, + { + "epoch": 1.036127719119705, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9170784950256348, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8667244911193848, + "num_tokens": 310662154.0, + "step": 8145 + }, + { + "epoch": 1.0362549293982954, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8898541927337646, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8724147081375122, + "num_tokens": 310701924.0, + "step": 8146 + }, + { + "epoch": 1.036382139676886, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.802121639251709, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8689827919006348, + "num_tokens": 310739688.0, + "step": 8147 + }, + { + "epoch": 1.0365093499554765, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9237505197525024, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8564205765724182, + "num_tokens": 310773317.0, + "step": 8148 + }, + { + "epoch": 1.0366365602340668, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9482252597808838, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8601750731468201, + "num_tokens": 310810854.0, + "step": 8149 + }, + { + "epoch": 1.0367637705126573, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.0905849933624268, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8563554883003235, + "num_tokens": 310846080.0, + "step": 8150 + }, + { + "epoch": 1.0368909807912479, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9422334432601929, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.840424120426178, + "num_tokens": 310885032.0, + "step": 8151 + }, + { + "epoch": 1.0370181910698384, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8837982416152954, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.868224024772644, + "num_tokens": 310924518.0, + "step": 8152 + }, + { + "epoch": 1.037145401348429, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9665791988372803, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8644304275512695, + "num_tokens": 310961734.0, + "step": 8153 + }, + { + "epoch": 1.0372726116270194, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8384137153625488, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8662398457527161, + "num_tokens": 311004541.0, + "step": 8154 + }, + { + "epoch": 1.03739982190561, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.733174443244934, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8713209629058838, + "num_tokens": 311043947.0, + "step": 8155 + }, + { + "epoch": 1.0375270321842005, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.85784113407135, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8595336675643921, + "num_tokens": 311083154.0, + "step": 8156 + }, + { + "epoch": 1.037654242462791, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8654087781906128, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8752026557922363, + "num_tokens": 311121429.0, + "step": 8157 + }, + { + "epoch": 1.0377814527413816, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.731213092803955, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8481190204620361, + "num_tokens": 311166578.0, + "step": 8158 + }, + { + "epoch": 1.037908663019972, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8496969938278198, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8655636310577393, + "num_tokens": 311209645.0, + "step": 8159 + }, + { + "epoch": 1.0380358732985626, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.0586025714874268, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8682156801223755, + "num_tokens": 311248428.0, + "step": 8160 + }, + { + "epoch": 1.0381630835771531, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.0209784507751465, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8640380501747131, + "num_tokens": 311282040.0, + "step": 8161 + }, + { + "epoch": 1.0382902938557435, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8498972654342651, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8696334958076477, + "num_tokens": 311320233.0, + "step": 8162 + }, + { + "epoch": 1.038417504134334, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.88931143283844, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8488209843635559, + "num_tokens": 311357586.0, + "step": 8163 + }, + { + "epoch": 1.0385447144129245, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8863630294799805, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.862856388092041, + "num_tokens": 311395545.0, + "step": 8164 + }, + { + "epoch": 1.038671924691515, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9920827150344849, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8628772497177124, + "num_tokens": 311428785.0, + "step": 8165 + }, + { + "epoch": 1.0387991349701056, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8650916814804077, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8504352569580078, + "num_tokens": 311469546.0, + "step": 8166 + }, + { + "epoch": 1.038926345248696, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9172497987747192, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.855703592300415, + "num_tokens": 311504064.0, + "step": 8167 + }, + { + "epoch": 1.0390535555272866, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9967052936553955, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8444172143936157, + "num_tokens": 311536600.0, + "step": 8168 + }, + { + "epoch": 1.0391807658058771, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8515783548355103, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.879359245300293, + "num_tokens": 311573638.0, + "step": 8169 + }, + { + "epoch": 1.0393079760844677, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8646557331085205, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8563392758369446, + "num_tokens": 311611085.0, + "step": 8170 + }, + { + "epoch": 1.0394351863630582, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.914787769317627, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8562574982643127, + "num_tokens": 311655497.0, + "step": 8171 + }, + { + "epoch": 1.0395623966416487, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9356318712234497, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.848656177520752, + "num_tokens": 311688167.0, + "step": 8172 + }, + { + "epoch": 1.0396896069202393, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9777114391326904, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8626111745834351, + "num_tokens": 311724926.0, + "step": 8173 + }, + { + "epoch": 1.0398168171988296, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.018963098526001, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8443028330802917, + "num_tokens": 311760729.0, + "step": 8174 + }, + { + "epoch": 1.03994402747742, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.887407660484314, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8627544045448303, + "num_tokens": 311802138.0, + "step": 8175 + }, + { + "epoch": 1.0400712377560106, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.201094150543213, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8691765069961548, + "num_tokens": 311838510.0, + "step": 8176 + }, + { + "epoch": 1.0401984480346012, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.024535894393921, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8560810685157776, + "num_tokens": 311870875.0, + "step": 8177 + }, + { + "epoch": 1.0403256583131917, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9559929370880127, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8752961158752441, + "num_tokens": 311905854.0, + "step": 8178 + }, + { + "epoch": 1.0404528685917822, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.1018476486206055, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8606246113777161, + "num_tokens": 311936381.0, + "step": 8179 + }, + { + "epoch": 1.0405800788703727, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.069805145263672, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8572984933853149, + "num_tokens": 311972529.0, + "step": 8180 + }, + { + "epoch": 1.0407072891489633, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.067265748977661, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.880439281463623, + "num_tokens": 312007883.0, + "step": 8181 + }, + { + "epoch": 1.0408344994275538, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9902607202529907, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8582367897033691, + "num_tokens": 312041000.0, + "step": 8182 + }, + { + "epoch": 1.0409617097061443, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8739323616027832, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8656492233276367, + "num_tokens": 312076011.0, + "step": 8183 + }, + { + "epoch": 1.0410889199847349, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9423177242279053, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8564577102661133, + "num_tokens": 312115614.0, + "step": 8184 + }, + { + "epoch": 1.0412161302633254, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.0145158767700195, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8565883636474609, + "num_tokens": 312154111.0, + "step": 8185 + }, + { + "epoch": 1.0413433405419157, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9386452436447144, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.856731653213501, + "num_tokens": 312191571.0, + "step": 8186 + }, + { + "epoch": 1.0414705508205062, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8442093133926392, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8559145927429199, + "num_tokens": 312229205.0, + "step": 8187 + }, + { + "epoch": 1.0415977610990967, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8404394388198853, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8772709369659424, + "num_tokens": 312266604.0, + "step": 8188 + }, + { + "epoch": 1.0417249713776873, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.025826930999756, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8509522676467896, + "num_tokens": 312305222.0, + "step": 8189 + }, + { + "epoch": 1.0418521816562778, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.985445261001587, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.840190589427948, + "num_tokens": 312345823.0, + "step": 8190 + }, + { + "epoch": 1.0419793919348683, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8893940448760986, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8537087440490723, + "num_tokens": 312383569.0, + "step": 8191 + }, + { + "epoch": 1.0421066022134589, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7430766820907593, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8658922910690308, + "num_tokens": 312423068.0, + "step": 8192 + }, + { + "epoch": 1.0422338124920494, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9417834281921387, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.857346773147583, + "num_tokens": 312460569.0, + "step": 8193 + }, + { + "epoch": 1.04236102277064, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.019940137863159, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8403213024139404, + "num_tokens": 312501654.0, + "step": 8194 + }, + { + "epoch": 1.0424882330492304, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.930196762084961, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8541062474250793, + "num_tokens": 312543630.0, + "step": 8195 + }, + { + "epoch": 1.042615443327821, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.992023229598999, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8602402210235596, + "num_tokens": 312579428.0, + "step": 8196 + }, + { + "epoch": 1.0427426536064115, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.7351312637329102, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8755530118942261, + "num_tokens": 312621388.0, + "step": 8197 + }, + { + "epoch": 1.0428698638850018, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.7103443145751953, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8624957799911499, + "num_tokens": 312665303.0, + "step": 8198 + }, + { + "epoch": 1.0429970741635923, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9574626684188843, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.852699339389801, + "num_tokens": 312702354.0, + "step": 8199 + }, + { + "epoch": 1.0431242844421829, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9631866216659546, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8666113615036011, + "num_tokens": 312737223.0, + "step": 8200 + }, + { + "epoch": 1.0432514947207734, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8808788061141968, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8633096218109131, + "num_tokens": 312773353.0, + "step": 8201 + }, + { + "epoch": 1.043378704999364, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.0326766967773438, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8435450792312622, + "num_tokens": 312809230.0, + "step": 8202 + }, + { + "epoch": 1.0435059152779544, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9418549537658691, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8798983097076416, + "num_tokens": 312841278.0, + "step": 8203 + }, + { + "epoch": 1.043633125556545, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8764121532440186, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8671903610229492, + "num_tokens": 312888469.0, + "step": 8204 + }, + { + "epoch": 1.0437603358351355, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9143694639205933, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8692063093185425, + "num_tokens": 312925446.0, + "step": 8205 + }, + { + "epoch": 1.043887546113726, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8211525678634644, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8595269322395325, + "num_tokens": 312965425.0, + "step": 8206 + }, + { + "epoch": 1.0440147563923166, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9715222120285034, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8555514216423035, + "num_tokens": 313000637.0, + "step": 8207 + }, + { + "epoch": 1.044141966670907, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9024802446365356, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8618147373199463, + "num_tokens": 313035986.0, + "step": 8208 + }, + { + "epoch": 1.0442691769494976, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9502168893814087, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8505794405937195, + "num_tokens": 313077027.0, + "step": 8209 + }, + { + "epoch": 1.0443963872280881, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8046798706054688, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8718595504760742, + "num_tokens": 313121733.0, + "step": 8210 + }, + { + "epoch": 1.0445235975066784, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8256739377975464, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8570617437362671, + "num_tokens": 313165542.0, + "step": 8211 + }, + { + "epoch": 1.044650807785269, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.689288854598999, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8504125475883484, + "num_tokens": 313201082.0, + "step": 8212 + }, + { + "epoch": 1.0447780180638595, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7844210863113403, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8730788230895996, + "num_tokens": 313241765.0, + "step": 8213 + }, + { + "epoch": 1.04490522834245, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.904019832611084, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.860413134098053, + "num_tokens": 313282471.0, + "step": 8214 + }, + { + "epoch": 1.0450324386210406, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.082688331604004, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8570551872253418, + "num_tokens": 313331505.0, + "step": 8215 + }, + { + "epoch": 1.045159648899631, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8317322731018066, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8621289134025574, + "num_tokens": 313369823.0, + "step": 8216 + }, + { + "epoch": 1.0452868591782216, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8816038370132446, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8618847131729126, + "num_tokens": 313404255.0, + "step": 8217 + }, + { + "epoch": 1.0454140694568121, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.880114197731018, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8726212978363037, + "num_tokens": 313439368.0, + "step": 8218 + }, + { + "epoch": 1.0455412797354027, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7905218601226807, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.854070782661438, + "num_tokens": 313479475.0, + "step": 8219 + }, + { + "epoch": 1.0456684900139932, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9651089906692505, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8518713712692261, + "num_tokens": 313514604.0, + "step": 8220 + }, + { + "epoch": 1.0457957002925837, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.916152000427246, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8579071164131165, + "num_tokens": 313548907.0, + "step": 8221 + }, + { + "epoch": 1.0459229105711743, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8299404382705688, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8613772988319397, + "num_tokens": 313586615.0, + "step": 8222 + }, + { + "epoch": 1.0460501208497646, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.0188634395599365, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8518115282058716, + "num_tokens": 313625202.0, + "step": 8223 + }, + { + "epoch": 1.046177331128355, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.885574221611023, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8555542230606079, + "num_tokens": 313663257.0, + "step": 8224 + }, + { + "epoch": 1.0463045414069456, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8119693994522095, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.866942286491394, + "num_tokens": 313701430.0, + "step": 8225 + }, + { + "epoch": 1.0464317516855361, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8446474075317383, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8733566403388977, + "num_tokens": 313741286.0, + "step": 8226 + }, + { + "epoch": 1.0465589619641267, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.0413389205932617, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8463810682296753, + "num_tokens": 313779115.0, + "step": 8227 + }, + { + "epoch": 1.0466861722427172, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7783700227737427, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8669347763061523, + "num_tokens": 313822331.0, + "step": 8228 + }, + { + "epoch": 1.0468133825213077, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8731111288070679, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8567172884941101, + "num_tokens": 313863545.0, + "step": 8229 + }, + { + "epoch": 1.0469405927998983, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9003149271011353, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8824350833892822, + "num_tokens": 313894956.0, + "step": 8230 + }, + { + "epoch": 1.0470678030784888, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.7721714973449707, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8710936903953552, + "num_tokens": 313927920.0, + "step": 8231 + }, + { + "epoch": 1.0471950133570793, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9416563510894775, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.870247483253479, + "num_tokens": 313963230.0, + "step": 8232 + }, + { + "epoch": 1.0473222236356698, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.071256399154663, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8535223007202148, + "num_tokens": 313995951.0, + "step": 8233 + }, + { + "epoch": 1.0474494339142604, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8447015285491943, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8626775741577148, + "num_tokens": 314040928.0, + "step": 8234 + }, + { + "epoch": 1.0475766441928507, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.1032187938690186, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8527325391769409, + "num_tokens": 314080329.0, + "step": 8235 + }, + { + "epoch": 1.0477038544714412, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9622435569763184, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8622521758079529, + "num_tokens": 314117107.0, + "step": 8236 + }, + { + "epoch": 1.0478310647500317, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.0271222591400146, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8588447570800781, + "num_tokens": 314148077.0, + "step": 8237 + }, + { + "epoch": 1.0479582750286223, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.2231662273406982, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8540172576904297, + "num_tokens": 314182064.0, + "step": 8238 + }, + { + "epoch": 1.0480854853072128, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8494287729263306, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8524749279022217, + "num_tokens": 314221667.0, + "step": 8239 + }, + { + "epoch": 1.0482126955858033, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.2177815437316895, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.859870970249176, + "num_tokens": 314255225.0, + "step": 8240 + }, + { + "epoch": 1.0483399058643939, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.177680730819702, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8616611361503601, + "num_tokens": 314288011.0, + "step": 8241 + }, + { + "epoch": 1.0484671161429844, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7169866561889648, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8573085069656372, + "num_tokens": 314332461.0, + "step": 8242 + }, + { + "epoch": 1.048594326421575, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9720290899276733, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8568058609962463, + "num_tokens": 314367967.0, + "step": 8243 + }, + { + "epoch": 1.0487215367001654, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8754527568817139, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.869949221611023, + "num_tokens": 314404487.0, + "step": 8244 + }, + { + "epoch": 1.048848746978756, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.928741216659546, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8604690432548523, + "num_tokens": 314444731.0, + "step": 8245 + }, + { + "epoch": 1.0489759572573465, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.059140205383301, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8634496927261353, + "num_tokens": 314479720.0, + "step": 8246 + }, + { + "epoch": 1.0491031675359368, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9277640581130981, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8785079717636108, + "num_tokens": 314515613.0, + "step": 8247 + }, + { + "epoch": 1.0492303778145273, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.826714038848877, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8597403764724731, + "num_tokens": 314556519.0, + "step": 8248 + }, + { + "epoch": 1.0493575880931179, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.3428194522857666, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8722054362297058, + "num_tokens": 314590803.0, + "step": 8249 + }, + { + "epoch": 1.0494847983717084, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.007025718688965, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8631908297538757, + "num_tokens": 314631347.0, + "step": 8250 + }, + { + "epoch": 1.049612008650299, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8485344648361206, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8607327938079834, + "num_tokens": 314673739.0, + "step": 8251 + }, + { + "epoch": 1.0497392189288894, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7440659999847412, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.861965537071228, + "num_tokens": 314718498.0, + "step": 8252 + }, + { + "epoch": 1.04986642920748, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.868090033531189, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8680543303489685, + "num_tokens": 314758755.0, + "step": 8253 + }, + { + "epoch": 1.0499936394860705, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9474177360534668, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8660080432891846, + "num_tokens": 314793644.0, + "step": 8254 + }, + { + "epoch": 1.050120849764661, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.053610324859619, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8592289090156555, + "num_tokens": 314828045.0, + "step": 8255 + }, + { + "epoch": 1.0502480600432516, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9232934713363647, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.864242672920227, + "num_tokens": 314866088.0, + "step": 8256 + }, + { + "epoch": 1.050375270321842, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 4.05856466293335, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8537197113037109, + "num_tokens": 314904757.0, + "step": 8257 + }, + { + "epoch": 1.0505024806004326, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.1039981842041016, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8541696071624756, + "num_tokens": 314941410.0, + "step": 8258 + }, + { + "epoch": 1.0506296908790231, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7403814792633057, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8696803450584412, + "num_tokens": 314982060.0, + "step": 8259 + }, + { + "epoch": 1.0507569011576134, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8833571672439575, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8529509902000427, + "num_tokens": 315021812.0, + "step": 8260 + }, + { + "epoch": 1.050884111436204, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8592331409454346, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.858411967754364, + "num_tokens": 315059520.0, + "step": 8261 + }, + { + "epoch": 1.0510113217147945, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.019817352294922, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8565776944160461, + "num_tokens": 315098392.0, + "step": 8262 + }, + { + "epoch": 1.051138531993385, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.99459969997406, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8504163026809692, + "num_tokens": 315137375.0, + "step": 8263 + }, + { + "epoch": 1.0512657422719756, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.984896183013916, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8661524057388306, + "num_tokens": 315171308.0, + "step": 8264 + }, + { + "epoch": 1.051392952550566, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.035146474838257, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8631490468978882, + "num_tokens": 315212176.0, + "step": 8265 + }, + { + "epoch": 1.0515201628291566, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9510207176208496, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8503312468528748, + "num_tokens": 315251321.0, + "step": 8266 + }, + { + "epoch": 1.0516473731077471, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9540116786956787, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8468902111053467, + "num_tokens": 315288780.0, + "step": 8267 + }, + { + "epoch": 1.0517745833863377, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7614995241165161, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8837186098098755, + "num_tokens": 315325630.0, + "step": 8268 + }, + { + "epoch": 1.0519017936649282, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9568909406661987, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8562874794006348, + "num_tokens": 315362887.0, + "step": 8269 + }, + { + "epoch": 1.0520290039435187, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9587948322296143, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8742688894271851, + "num_tokens": 315397552.0, + "step": 8270 + }, + { + "epoch": 1.0521562142221093, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8126564025878906, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8722902536392212, + "num_tokens": 315436785.0, + "step": 8271 + }, + { + "epoch": 1.0522834245006996, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.949467420578003, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8711145520210266, + "num_tokens": 315467849.0, + "step": 8272 + }, + { + "epoch": 1.05241063477929, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 9.571187019348145, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8699603080749512, + "num_tokens": 315501496.0, + "step": 8273 + }, + { + "epoch": 1.0525378450578806, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9044512510299683, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8706142902374268, + "num_tokens": 315540878.0, + "step": 8274 + }, + { + "epoch": 1.0526650553364711, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9532420635223389, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.876428484916687, + "num_tokens": 315578139.0, + "step": 8275 + }, + { + "epoch": 1.0527922656150617, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9178733825683594, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8635965585708618, + "num_tokens": 315615845.0, + "step": 8276 + }, + { + "epoch": 1.0529194758936522, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9926027059555054, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8525500297546387, + "num_tokens": 315645407.0, + "step": 8277 + }, + { + "epoch": 1.0530466861722427, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8244136571884155, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8690770864486694, + "num_tokens": 315687922.0, + "step": 8278 + }, + { + "epoch": 1.0531738964508333, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.931429147720337, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8642597794532776, + "num_tokens": 315723730.0, + "step": 8279 + }, + { + "epoch": 1.0533011067294238, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.6894906759262085, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.874014139175415, + "num_tokens": 315765662.0, + "step": 8280 + }, + { + "epoch": 1.0534283170080143, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8576844930648804, + "learning_rate": 1e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.837610125541687, + "num_tokens": 315808668.0, + "step": 8281 + }, + { + "epoch": 1.0535555272866048, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9220584630966187, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8579739332199097, + "num_tokens": 315844878.0, + "step": 8282 + }, + { + "epoch": 1.0536827375651954, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8724634647369385, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8758945465087891, + "num_tokens": 315879472.0, + "step": 8283 + }, + { + "epoch": 1.0538099478437857, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.920095682144165, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.857800304889679, + "num_tokens": 315915496.0, + "step": 8284 + }, + { + "epoch": 1.0539371581223762, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.796105146408081, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8632805943489075, + "num_tokens": 315954317.0, + "step": 8285 + }, + { + "epoch": 1.0540643684009667, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9899438619613647, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8570865392684937, + "num_tokens": 315995518.0, + "step": 8286 + }, + { + "epoch": 1.0541915786795573, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7018102407455444, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.876197874546051, + "num_tokens": 316040475.0, + "step": 8287 + }, + { + "epoch": 1.0543187889581478, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.801668643951416, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8604351282119751, + "num_tokens": 316083003.0, + "step": 8288 + }, + { + "epoch": 1.0544459992367383, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8396855592727661, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8635667562484741, + "num_tokens": 316124687.0, + "step": 8289 + }, + { + "epoch": 1.0545732095153288, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.826197624206543, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8494094610214233, + "num_tokens": 316170052.0, + "step": 8290 + }, + { + "epoch": 1.0547004197939194, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8632621765136719, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8484722375869751, + "num_tokens": 316211941.0, + "step": 8291 + }, + { + "epoch": 1.05482763007251, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8692638874053955, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8664302825927734, + "num_tokens": 316252592.0, + "step": 8292 + }, + { + "epoch": 1.0549548403511004, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8706815242767334, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8694900870323181, + "num_tokens": 316287250.0, + "step": 8293 + }, + { + "epoch": 1.055082050629691, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.788525938987732, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8659372925758362, + "num_tokens": 316326308.0, + "step": 8294 + }, + { + "epoch": 1.0552092609082815, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8638907670974731, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.865242063999176, + "num_tokens": 316364519.0, + "step": 8295 + }, + { + "epoch": 1.0553364711868718, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7649401426315308, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8681163191795349, + "num_tokens": 316401165.0, + "step": 8296 + }, + { + "epoch": 1.0554636814654623, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9299649000167847, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8575164079666138, + "num_tokens": 316440245.0, + "step": 8297 + }, + { + "epoch": 1.0555908917440529, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.784403681755066, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8714170455932617, + "num_tokens": 316484969.0, + "step": 8298 + }, + { + "epoch": 1.0557181020226434, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.837844967842102, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8797088265419006, + "num_tokens": 316520510.0, + "step": 8299 + }, + { + "epoch": 1.055845312301234, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8967503309249878, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8467002511024475, + "num_tokens": 316561946.0, + "step": 8300 + }, + { + "epoch": 1.0559725225798244, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7910557985305786, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8796287178993225, + "num_tokens": 316605489.0, + "step": 8301 + }, + { + "epoch": 1.056099732858415, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8131688833236694, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8465001583099365, + "num_tokens": 316648590.0, + "step": 8302 + }, + { + "epoch": 1.0562269431370055, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7996553182601929, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8745615482330322, + "num_tokens": 316688169.0, + "step": 8303 + }, + { + "epoch": 1.056354153415596, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.0352375507354736, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.842035174369812, + "num_tokens": 316724031.0, + "step": 8304 + }, + { + "epoch": 1.0564813636941865, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7709712982177734, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8611996173858643, + "num_tokens": 316769762.0, + "step": 8305 + }, + { + "epoch": 1.056608573972777, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.6532737016677856, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8700302243232727, + "num_tokens": 316814156.0, + "step": 8306 + }, + { + "epoch": 1.0567357842513676, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7175744771957397, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8717907667160034, + "num_tokens": 316852784.0, + "step": 8307 + }, + { + "epoch": 1.0568629945299581, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8007570505142212, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8566877841949463, + "num_tokens": 316891992.0, + "step": 8308 + }, + { + "epoch": 1.0569902048085484, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7613924741744995, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8763227462768555, + "num_tokens": 316932246.0, + "step": 8309 + }, + { + "epoch": 1.057117415087139, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7848201990127563, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8541992902755737, + "num_tokens": 316972017.0, + "step": 8310 + }, + { + "epoch": 1.0572446253657295, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9098610877990723, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.857431948184967, + "num_tokens": 317008631.0, + "step": 8311 + }, + { + "epoch": 1.05737183564432, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.9464703798294067, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8681930303573608, + "num_tokens": 317048616.0, + "step": 8312 + }, + { + "epoch": 1.0574990459229106, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.8946095705032349, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8551756143569946, + "num_tokens": 317088675.0, + "step": 8313 + }, + { + "epoch": 1.057626256201501, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8644696474075317, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8837075233459473, + "num_tokens": 317120351.0, + "step": 8314 + }, + { + "epoch": 1.0577534664800916, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.923609733581543, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8567744493484497, + "num_tokens": 317159027.0, + "step": 8315 + }, + { + "epoch": 1.0578806767586821, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.222871780395508, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8618963956832886, + "num_tokens": 317188892.0, + "step": 8316 + }, + { + "epoch": 1.0580078870372727, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.0529539585113525, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8672579526901245, + "num_tokens": 317229026.0, + "step": 8317 + }, + { + "epoch": 1.0581350973158632, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.797123908996582, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8614571690559387, + "num_tokens": 317272946.0, + "step": 8318 + }, + { + "epoch": 1.0582623075944537, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.6413769721984863, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8622978925704956, + "num_tokens": 317314625.0, + "step": 8319 + }, + { + "epoch": 1.058389517873044, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8443726301193237, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8716617226600647, + "num_tokens": 317351450.0, + "step": 8320 + }, + { + "epoch": 1.0585167281516346, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9135836362838745, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8463535308837891, + "num_tokens": 317391519.0, + "step": 8321 + }, + { + "epoch": 1.058643938430225, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8436781167984009, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8569234609603882, + "num_tokens": 317430672.0, + "step": 8322 + }, + { + "epoch": 1.0587711487088156, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.904593586921692, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8638785481452942, + "num_tokens": 317470203.0, + "step": 8323 + }, + { + "epoch": 1.0588983589874061, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.830202579498291, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8651299476623535, + "num_tokens": 317509594.0, + "step": 8324 + }, + { + "epoch": 1.0590255692659967, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9708452224731445, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8600912094116211, + "num_tokens": 317549735.0, + "step": 8325 + }, + { + "epoch": 1.0591527795445872, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.969765067100525, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8598526120185852, + "num_tokens": 317585064.0, + "step": 8326 + }, + { + "epoch": 1.0592799898231777, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8161345720291138, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8789894580841064, + "num_tokens": 317622289.0, + "step": 8327 + }, + { + "epoch": 1.0594072001017683, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8518120050430298, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8517657518386841, + "num_tokens": 317662872.0, + "step": 8328 + }, + { + "epoch": 1.0595344103803588, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.7993468046188354, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8632199168205261, + "num_tokens": 317701418.0, + "step": 8329 + }, + { + "epoch": 1.0596616206589493, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8600409030914307, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8753519058227539, + "num_tokens": 317741416.0, + "step": 8330 + }, + { + "epoch": 1.0597888309375398, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.1068499088287354, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8673568964004517, + "num_tokens": 317778623.0, + "step": 8331 + }, + { + "epoch": 1.0599160412161304, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9217135906219482, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.864606499671936, + "num_tokens": 317816577.0, + "step": 8332 + }, + { + "epoch": 1.0600432514947207, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.0249640941619873, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8622820377349854, + "num_tokens": 317851532.0, + "step": 8333 + }, + { + "epoch": 1.0601704617733112, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8751468658447266, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8625853657722473, + "num_tokens": 317886789.0, + "step": 8334 + }, + { + "epoch": 1.0602976720519017, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9613628387451172, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8642911314964294, + "num_tokens": 317929511.0, + "step": 8335 + }, + { + "epoch": 1.0604248823304923, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 9.578231811523438, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8606860637664795, + "num_tokens": 317963767.0, + "step": 8336 + }, + { + "epoch": 1.0605520926090828, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.0408434867858887, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8791650533676147, + "num_tokens": 318000458.0, + "step": 8337 + }, + { + "epoch": 1.0606793028876733, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.857437014579773, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.879112958908081, + "num_tokens": 318040394.0, + "step": 8338 + }, + { + "epoch": 1.0608065131662638, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8494832515716553, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8589133024215698, + "num_tokens": 318080859.0, + "step": 8339 + }, + { + "epoch": 1.0609337234448544, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7665048837661743, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8784074187278748, + "num_tokens": 318115378.0, + "step": 8340 + }, + { + "epoch": 1.061060933723445, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9247585535049438, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8559759855270386, + "num_tokens": 318150351.0, + "step": 8341 + }, + { + "epoch": 1.0611881440020354, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.7694416046142578, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8627159595489502, + "num_tokens": 318192427.0, + "step": 8342 + }, + { + "epoch": 1.061315354280626, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8650175333023071, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8501721620559692, + "num_tokens": 318232150.0, + "step": 8343 + }, + { + "epoch": 1.0614425645592165, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.0592403411865234, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8581974506378174, + "num_tokens": 318267742.0, + "step": 8344 + }, + { + "epoch": 1.0615697748378068, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.870912790298462, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8661131858825684, + "num_tokens": 318307974.0, + "step": 8345 + }, + { + "epoch": 1.0616969851163973, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9194597005844116, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8454114198684692, + "num_tokens": 318349880.0, + "step": 8346 + }, + { + "epoch": 1.0618241953949878, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9333571195602417, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8610405921936035, + "num_tokens": 318383869.0, + "step": 8347 + }, + { + "epoch": 1.0619514056735784, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.823880910873413, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.878812313079834, + "num_tokens": 318419659.0, + "step": 8348 + }, + { + "epoch": 1.062078615952169, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8546189069747925, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8714452981948853, + "num_tokens": 318460795.0, + "step": 8349 + }, + { + "epoch": 1.0622058262307594, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.0616843700408936, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8527160882949829, + "num_tokens": 318499451.0, + "step": 8350 + }, + { + "epoch": 1.06233303650935, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8959959745407104, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8666249513626099, + "num_tokens": 318533413.0, + "step": 8351 + }, + { + "epoch": 1.0624602467879405, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9489161968231201, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8536244630813599, + "num_tokens": 318572533.0, + "step": 8352 + }, + { + "epoch": 1.062587457066531, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8837809562683105, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8664108514785767, + "num_tokens": 318609442.0, + "step": 8353 + }, + { + "epoch": 1.0627146673451215, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9431239366531372, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8766628503799438, + "num_tokens": 318643475.0, + "step": 8354 + }, + { + "epoch": 1.062841877623712, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9051494598388672, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8656978011131287, + "num_tokens": 318684223.0, + "step": 8355 + }, + { + "epoch": 1.0629690879023026, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.049337387084961, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8630658388137817, + "num_tokens": 318717148.0, + "step": 8356 + }, + { + "epoch": 1.0630962981808931, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.778525948524475, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8731368184089661, + "num_tokens": 318758929.0, + "step": 8357 + }, + { + "epoch": 1.0632235084594834, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9861122369766235, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8610159754753113, + "num_tokens": 318796438.0, + "step": 8358 + }, + { + "epoch": 1.063350718738074, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.0571131706237793, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8497518301010132, + "num_tokens": 318835732.0, + "step": 8359 + }, + { + "epoch": 1.0634779290166645, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.9236791133880615, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8615707159042358, + "num_tokens": 318872151.0, + "step": 8360 + }, + { + "epoch": 1.063605139295255, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.8207437992095947, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8516318798065186, + "num_tokens": 318911956.0, + "step": 8361 + }, + { + "epoch": 1.0637323495738455, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.917127013206482, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8637526631355286, + "num_tokens": 318945593.0, + "step": 8362 + }, + { + "epoch": 1.063859559852436, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.040415048599243, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.87297523021698, + "num_tokens": 318976670.0, + "step": 8363 + }, + { + "epoch": 1.0639867701310266, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.0516819953918457, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8653351068496704, + "num_tokens": 319007798.0, + "step": 8364 + }, + { + "epoch": 1.0641139804096171, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9480856657028198, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8527140617370605, + "num_tokens": 319044808.0, + "step": 8365 + }, + { + "epoch": 1.0642411906882077, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7784634828567505, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8829540014266968, + "num_tokens": 319083387.0, + "step": 8366 + }, + { + "epoch": 1.0643684009667982, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8754656314849854, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8783971071243286, + "num_tokens": 319121897.0, + "step": 8367 + }, + { + "epoch": 1.0644956112453887, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9093095064163208, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8525494933128357, + "num_tokens": 319157190.0, + "step": 8368 + }, + { + "epoch": 1.064622821523979, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.0239408016204834, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8725148439407349, + "num_tokens": 319188353.0, + "step": 8369 + }, + { + "epoch": 1.0647500318025696, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9435347318649292, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8657783269882202, + "num_tokens": 319225536.0, + "step": 8370 + }, + { + "epoch": 1.06487724208116, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8134655952453613, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8691704869270325, + "num_tokens": 319265114.0, + "step": 8371 + }, + { + "epoch": 1.0650044523597506, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7477192878723145, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8727900981903076, + "num_tokens": 319309280.0, + "step": 8372 + }, + { + "epoch": 1.0651316626383411, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.772455096244812, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8739266991615295, + "num_tokens": 319346114.0, + "step": 8373 + }, + { + "epoch": 1.0652588729169317, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7975348234176636, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8702346086502075, + "num_tokens": 319389001.0, + "step": 8374 + }, + { + "epoch": 1.0653860831955222, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8098032474517822, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8716570734977722, + "num_tokens": 319431464.0, + "step": 8375 + }, + { + "epoch": 1.0655132934741127, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8177502155303955, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8392861485481262, + "num_tokens": 319475638.0, + "step": 8376 + }, + { + "epoch": 1.0656405037527032, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8757661581039429, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8730176687240601, + "num_tokens": 319515983.0, + "step": 8377 + }, + { + "epoch": 1.0657677140312938, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.0648205280303955, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8595744371414185, + "num_tokens": 319552152.0, + "step": 8378 + }, + { + "epoch": 1.0658949243098843, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8947854042053223, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8547182083129883, + "num_tokens": 319592963.0, + "step": 8379 + }, + { + "epoch": 1.0660221345884748, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9526801109313965, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8497426509857178, + "num_tokens": 319635136.0, + "step": 8380 + }, + { + "epoch": 1.0661493448670654, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9977974891662598, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8663065433502197, + "num_tokens": 319670118.0, + "step": 8381 + }, + { + "epoch": 1.0662765551456557, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.958938479423523, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.870008111000061, + "num_tokens": 319711057.0, + "step": 8382 + }, + { + "epoch": 1.0664037654242462, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9291807413101196, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.861624002456665, + "num_tokens": 319746451.0, + "step": 8383 + }, + { + "epoch": 1.0665309757028367, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8182605504989624, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8573019504547119, + "num_tokens": 319792149.0, + "step": 8384 + }, + { + "epoch": 1.0666581859814273, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.975081205368042, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8545788526535034, + "num_tokens": 319826508.0, + "step": 8385 + }, + { + "epoch": 1.0667853962600178, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8174927234649658, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8666545748710632, + "num_tokens": 319867000.0, + "step": 8386 + }, + { + "epoch": 1.0669126065386083, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.0224649906158447, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8444899320602417, + "num_tokens": 319904476.0, + "step": 8387 + }, + { + "epoch": 1.0670398168171988, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.93926203250885, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8387665748596191, + "num_tokens": 319943279.0, + "step": 8388 + }, + { + "epoch": 1.0671670270957894, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.809293508529663, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8594895601272583, + "num_tokens": 319986610.0, + "step": 8389 + }, + { + "epoch": 1.06729423737438, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9405982494354248, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8568581342697144, + "num_tokens": 320022702.0, + "step": 8390 + }, + { + "epoch": 1.0674214476529704, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7975014448165894, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8664345741271973, + "num_tokens": 320067918.0, + "step": 8391 + }, + { + "epoch": 1.067548657931561, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.87344491481781, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8557132482528687, + "num_tokens": 320110455.0, + "step": 8392 + }, + { + "epoch": 1.0676758682101515, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8530561923980713, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8791314363479614, + "num_tokens": 320147675.0, + "step": 8393 + }, + { + "epoch": 1.0678030784887418, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.990452766418457, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8489789962768555, + "num_tokens": 320183882.0, + "step": 8394 + }, + { + "epoch": 1.0679302887673323, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9271345138549805, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8531230688095093, + "num_tokens": 320222215.0, + "step": 8395 + }, + { + "epoch": 1.0680574990459228, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.0654046535491943, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8562270998954773, + "num_tokens": 320262418.0, + "step": 8396 + }, + { + "epoch": 1.0681847093245134, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9320672750473022, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8560178279876709, + "num_tokens": 320297304.0, + "step": 8397 + }, + { + "epoch": 1.068311919603104, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.920581340789795, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8656492233276367, + "num_tokens": 320330919.0, + "step": 8398 + }, + { + "epoch": 1.0684391298816944, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9652730226516724, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8633414506912231, + "num_tokens": 320367602.0, + "step": 8399 + }, + { + "epoch": 1.068566340160285, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8806818723678589, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8626697659492493, + "num_tokens": 320403875.0, + "step": 8400 + }, + { + "epoch": 1.0686935504388755, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1912031173706055, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8663975596427917, + "num_tokens": 320439149.0, + "step": 8401 + }, + { + "epoch": 1.068820760717466, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9294062852859497, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8746656179428101, + "num_tokens": 320476786.0, + "step": 8402 + }, + { + "epoch": 1.0689479709960565, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9457464218139648, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8475103378295898, + "num_tokens": 320516945.0, + "step": 8403 + }, + { + "epoch": 1.069075181274647, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8324538469314575, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8773119449615479, + "num_tokens": 320552801.0, + "step": 8404 + }, + { + "epoch": 1.0692023915532376, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9308762550354004, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8646673560142517, + "num_tokens": 320593605.0, + "step": 8405 + }, + { + "epoch": 1.0693296018318281, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.0728116035461426, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8695127964019775, + "num_tokens": 320624476.0, + "step": 8406 + }, + { + "epoch": 1.0694568121104184, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7789565324783325, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.869782567024231, + "num_tokens": 320667000.0, + "step": 8407 + }, + { + "epoch": 1.069584022389009, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.048013925552368, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8684000968933105, + "num_tokens": 320699286.0, + "step": 8408 + }, + { + "epoch": 1.0697112326675995, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.8204593658447266, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8696605563163757, + "num_tokens": 320732520.0, + "step": 8409 + }, + { + "epoch": 1.06983844294619, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9109230041503906, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8516321182250977, + "num_tokens": 320773425.0, + "step": 8410 + }, + { + "epoch": 1.0699656532247805, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9338618516921997, + "learning_rate": 1e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8402959108352661, + "num_tokens": 320810859.0, + "step": 8411 + }, + { + "epoch": 1.070092863503371, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.748699426651001, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8666930198669434, + "num_tokens": 320850266.0, + "step": 8412 + }, + { + "epoch": 1.0702200737819616, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8145517110824585, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8612415790557861, + "num_tokens": 320892812.0, + "step": 8413 + }, + { + "epoch": 1.0703472840605521, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8634365797042847, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.874630868434906, + "num_tokens": 320929563.0, + "step": 8414 + }, + { + "epoch": 1.0704744943391427, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.942318320274353, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8699253797531128, + "num_tokens": 320964010.0, + "step": 8415 + }, + { + "epoch": 1.0706017046177332, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7809147834777832, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8596906661987305, + "num_tokens": 321006405.0, + "step": 8416 + }, + { + "epoch": 1.0707289148963237, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.885270357131958, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8579843044281006, + "num_tokens": 321050146.0, + "step": 8417 + }, + { + "epoch": 1.070856125174914, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9322608709335327, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8626747727394104, + "num_tokens": 321086857.0, + "step": 8418 + }, + { + "epoch": 1.0709833354535045, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8208115100860596, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8689635992050171, + "num_tokens": 321127310.0, + "step": 8419 + }, + { + "epoch": 1.071110545732095, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9932211637496948, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8536287546157837, + "num_tokens": 321165881.0, + "step": 8420 + }, + { + "epoch": 1.0712377560106856, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.856940507888794, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8566955327987671, + "num_tokens": 321204494.0, + "step": 8421 + }, + { + "epoch": 1.0713649662892761, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8952234983444214, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8521502017974854, + "num_tokens": 321243639.0, + "step": 8422 + }, + { + "epoch": 1.0714921765678667, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.1296627521514893, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8535028696060181, + "num_tokens": 321281399.0, + "step": 8423 + }, + { + "epoch": 1.0716193868464572, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.0036725997924805, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8502026200294495, + "num_tokens": 321316539.0, + "step": 8424 + }, + { + "epoch": 1.0717465971250477, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.211320400238037, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.858242392539978, + "num_tokens": 321348618.0, + "step": 8425 + }, + { + "epoch": 1.0718738074036382, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9700689315795898, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8675369620323181, + "num_tokens": 321381674.0, + "step": 8426 + }, + { + "epoch": 1.0720010176822288, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8009206056594849, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8779836297035217, + "num_tokens": 321420342.0, + "step": 8427 + }, + { + "epoch": 1.0721282279608193, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.05787992477417, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.860039234161377, + "num_tokens": 321450770.0, + "step": 8428 + }, + { + "epoch": 1.0722554382394098, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8967283964157104, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8656501173973083, + "num_tokens": 321491450.0, + "step": 8429 + }, + { + "epoch": 1.0723826485180004, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8772391080856323, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8557326793670654, + "num_tokens": 321531774.0, + "step": 8430 + }, + { + "epoch": 1.0725098587965907, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9453202486038208, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8626935482025146, + "num_tokens": 321567692.0, + "step": 8431 + }, + { + "epoch": 1.0726370690751812, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.098883628845215, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.858506441116333, + "num_tokens": 321601168.0, + "step": 8432 + }, + { + "epoch": 1.0727642793537717, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.011975049972534, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8432557582855225, + "num_tokens": 321636100.0, + "step": 8433 + }, + { + "epoch": 1.0728914896323622, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8239113092422485, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8621866703033447, + "num_tokens": 321674365.0, + "step": 8434 + }, + { + "epoch": 1.0730186999109528, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8256219625473022, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8729814291000366, + "num_tokens": 321712267.0, + "step": 8435 + }, + { + "epoch": 1.0731459101895433, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.103830575942993, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8602672219276428, + "num_tokens": 321749955.0, + "step": 8436 + }, + { + "epoch": 1.0732731204681338, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9500452280044556, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8702903389930725, + "num_tokens": 321788522.0, + "step": 8437 + }, + { + "epoch": 1.0734003307467244, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.932135820388794, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8592886924743652, + "num_tokens": 321826314.0, + "step": 8438 + }, + { + "epoch": 1.073527541025315, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9150879383087158, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8579219579696655, + "num_tokens": 321864851.0, + "step": 8439 + }, + { + "epoch": 1.0736547513039054, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9633071422576904, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8718099594116211, + "num_tokens": 321901001.0, + "step": 8440 + }, + { + "epoch": 1.073781961582496, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8920657634735107, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8597509860992432, + "num_tokens": 321941368.0, + "step": 8441 + }, + { + "epoch": 1.0739091718610865, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.030592679977417, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8657771944999695, + "num_tokens": 321978268.0, + "step": 8442 + }, + { + "epoch": 1.0740363821396768, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8077493906021118, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8699734210968018, + "num_tokens": 322016969.0, + "step": 8443 + }, + { + "epoch": 1.0741635924182673, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9122114181518555, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8550310134887695, + "num_tokens": 322059362.0, + "step": 8444 + }, + { + "epoch": 1.0742908026968578, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9432651996612549, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8428888916969299, + "num_tokens": 322097334.0, + "step": 8445 + }, + { + "epoch": 1.0744180129754484, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.0018129348754883, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8820917010307312, + "num_tokens": 322121988.0, + "step": 8446 + }, + { + "epoch": 1.074545223254039, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1927053928375244, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8563989400863647, + "num_tokens": 322156602.0, + "step": 8447 + }, + { + "epoch": 1.0746724335326294, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9592504501342773, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8594219088554382, + "num_tokens": 322194475.0, + "step": 8448 + }, + { + "epoch": 1.07479964381122, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9318684339523315, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8716918230056763, + "num_tokens": 322232158.0, + "step": 8449 + }, + { + "epoch": 1.0749268540898105, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8057197332382202, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8723956346511841, + "num_tokens": 322272170.0, + "step": 8450 + }, + { + "epoch": 1.075054064368401, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.100470542907715, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8674469590187073, + "num_tokens": 322305496.0, + "step": 8451 + }, + { + "epoch": 1.0751812746469915, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7461689710617065, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8668967485427856, + "num_tokens": 322347552.0, + "step": 8452 + }, + { + "epoch": 1.075308484925582, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8653734922409058, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8661627769470215, + "num_tokens": 322386471.0, + "step": 8453 + }, + { + "epoch": 1.0754356952041726, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.871455430984497, + "learning_rate": 1e-06, + "loss": 0.5452, + "mean_token_accuracy": 0.8282605409622192, + "num_tokens": 322431019.0, + "step": 8454 + }, + { + "epoch": 1.0755629054827631, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7191014289855957, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8817721605300903, + "num_tokens": 322471525.0, + "step": 8455 + }, + { + "epoch": 1.0756901157613534, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8755522966384888, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8650698661804199, + "num_tokens": 322516594.0, + "step": 8456 + }, + { + "epoch": 1.075817326039944, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8635656833648682, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.869260311126709, + "num_tokens": 322557837.0, + "step": 8457 + }, + { + "epoch": 1.0759445363185345, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9454035758972168, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8580148220062256, + "num_tokens": 322594165.0, + "step": 8458 + }, + { + "epoch": 1.076071746597125, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8874720335006714, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8448604941368103, + "num_tokens": 322637083.0, + "step": 8459 + }, + { + "epoch": 1.0761989568757155, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9082869291305542, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8606582880020142, + "num_tokens": 322673216.0, + "step": 8460 + }, + { + "epoch": 1.076326167154306, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8642789125442505, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8658110499382019, + "num_tokens": 322709150.0, + "step": 8461 + }, + { + "epoch": 1.0764533774328966, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8435474634170532, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.869236946105957, + "num_tokens": 322747893.0, + "step": 8462 + }, + { + "epoch": 1.0765805877114871, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0886144638061523, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8460965156555176, + "num_tokens": 322781667.0, + "step": 8463 + }, + { + "epoch": 1.0767077979900777, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9260708093643188, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8616925477981567, + "num_tokens": 322818008.0, + "step": 8464 + }, + { + "epoch": 1.0768350082686682, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9123780727386475, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8422985076904297, + "num_tokens": 322856528.0, + "step": 8465 + }, + { + "epoch": 1.0769622185472587, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.2000794410705566, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8651160001754761, + "num_tokens": 322888798.0, + "step": 8466 + }, + { + "epoch": 1.077089428825849, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9568979740142822, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8636698722839355, + "num_tokens": 322923301.0, + "step": 8467 + }, + { + "epoch": 1.0772166391044395, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8036612272262573, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8743914365768433, + "num_tokens": 322964253.0, + "step": 8468 + }, + { + "epoch": 1.07734384938303, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7971582412719727, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8609826564788818, + "num_tokens": 323006926.0, + "step": 8469 + }, + { + "epoch": 1.0774710596616206, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.09279727935791, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8747826814651489, + "num_tokens": 323047541.0, + "step": 8470 + }, + { + "epoch": 1.0775982699402111, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1042327880859375, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8541196584701538, + "num_tokens": 323084272.0, + "step": 8471 + }, + { + "epoch": 1.0777254802188017, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.029907464981079, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8741203546524048, + "num_tokens": 323116698.0, + "step": 8472 + }, + { + "epoch": 1.0778526904973922, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.730258822441101, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8693671226501465, + "num_tokens": 323158836.0, + "step": 8473 + }, + { + "epoch": 1.0779799007759827, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0419349670410156, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8593600392341614, + "num_tokens": 323193777.0, + "step": 8474 + }, + { + "epoch": 1.0781071110545732, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8728142976760864, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8634769916534424, + "num_tokens": 323232546.0, + "step": 8475 + }, + { + "epoch": 1.0782343213331638, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.900219202041626, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8623366951942444, + "num_tokens": 323269403.0, + "step": 8476 + }, + { + "epoch": 1.0783615316117543, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8303723335266113, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8757367134094238, + "num_tokens": 323305982.0, + "step": 8477 + }, + { + "epoch": 1.0784887418903448, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9837217330932617, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8579615950584412, + "num_tokens": 323341032.0, + "step": 8478 + }, + { + "epoch": 1.0786159521689354, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9053609371185303, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8573060035705566, + "num_tokens": 323381647.0, + "step": 8479 + }, + { + "epoch": 1.0787431624475257, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9050462245941162, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8676842451095581, + "num_tokens": 323417185.0, + "step": 8480 + }, + { + "epoch": 1.0788703727261162, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9212143421173096, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8630067110061646, + "num_tokens": 323454624.0, + "step": 8481 + }, + { + "epoch": 1.0789975830047067, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8685588836669922, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8623286485671997, + "num_tokens": 323492862.0, + "step": 8482 + }, + { + "epoch": 1.0791247932832972, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9379678964614868, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8593487739562988, + "num_tokens": 323536012.0, + "step": 8483 + }, + { + "epoch": 1.0792520035618878, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.84991455078125, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8677871227264404, + "num_tokens": 323573495.0, + "step": 8484 + }, + { + "epoch": 1.0793792138404783, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8536193370819092, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8729565143585205, + "num_tokens": 323610900.0, + "step": 8485 + }, + { + "epoch": 1.0795064241190688, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7952990531921387, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8605095148086548, + "num_tokens": 323650983.0, + "step": 8486 + }, + { + "epoch": 1.0796336343976594, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.894612431526184, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8551145195960999, + "num_tokens": 323690155.0, + "step": 8487 + }, + { + "epoch": 1.0797608446762499, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8553248643875122, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8766053915023804, + "num_tokens": 323732212.0, + "step": 8488 + }, + { + "epoch": 1.0798880549548404, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8473498821258545, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8714539408683777, + "num_tokens": 323773400.0, + "step": 8489 + }, + { + "epoch": 1.080015265233431, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9450098276138306, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8697144985198975, + "num_tokens": 323812753.0, + "step": 8490 + }, + { + "epoch": 1.0801424755120215, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8432985544204712, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8648450970649719, + "num_tokens": 323855835.0, + "step": 8491 + }, + { + "epoch": 1.0802696857906118, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7455934286117554, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8759232759475708, + "num_tokens": 323896065.0, + "step": 8492 + }, + { + "epoch": 1.0803968960692023, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9126266241073608, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8656466007232666, + "num_tokens": 323934351.0, + "step": 8493 + }, + { + "epoch": 1.0805241063477928, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9904847145080566, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8523775935173035, + "num_tokens": 323968878.0, + "step": 8494 + }, + { + "epoch": 1.0806513166263834, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9232369661331177, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8626320362091064, + "num_tokens": 324004838.0, + "step": 8495 + }, + { + "epoch": 1.080778526904974, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.0286307334899902, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8788704872131348, + "num_tokens": 324040851.0, + "step": 8496 + }, + { + "epoch": 1.0809057371835644, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8507550954818726, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8509894013404846, + "num_tokens": 324084675.0, + "step": 8497 + }, + { + "epoch": 1.081032947462155, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8530950546264648, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8681329488754272, + "num_tokens": 324122136.0, + "step": 8498 + }, + { + "epoch": 1.0811601577407455, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9994308948516846, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8621875047683716, + "num_tokens": 324153142.0, + "step": 8499 + }, + { + "epoch": 1.081287368019336, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.117626190185547, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8445302248001099, + "num_tokens": 324187244.0, + "step": 8500 + }, + { + "epoch": 1.0814145782979265, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9170458316802979, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8650340437889099, + "num_tokens": 324225644.0, + "step": 8501 + }, + { + "epoch": 1.081541788576517, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7295691967010498, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8705905079841614, + "num_tokens": 324270394.0, + "step": 8502 + }, + { + "epoch": 1.0816689988551076, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7696759700775146, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8744862079620361, + "num_tokens": 324311244.0, + "step": 8503 + }, + { + "epoch": 1.0817962091336981, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9225730895996094, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8443169593811035, + "num_tokens": 324350821.0, + "step": 8504 + }, + { + "epoch": 1.0819234194122884, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.852452039718628, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8621718287467957, + "num_tokens": 324388255.0, + "step": 8505 + }, + { + "epoch": 1.082050629690879, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8936048746109009, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.837522029876709, + "num_tokens": 324430068.0, + "step": 8506 + }, + { + "epoch": 1.0821778399694695, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8999642133712769, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8683586120605469, + "num_tokens": 324465341.0, + "step": 8507 + }, + { + "epoch": 1.08230505024806, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.11431622505188, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8697723150253296, + "num_tokens": 324502199.0, + "step": 8508 + }, + { + "epoch": 1.0824322605266505, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.052882671356201, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8648110628128052, + "num_tokens": 324535898.0, + "step": 8509 + }, + { + "epoch": 1.082559470805241, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0565264225006104, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8520742654800415, + "num_tokens": 324574404.0, + "step": 8510 + }, + { + "epoch": 1.0826866810838316, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8733694553375244, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.861994743347168, + "num_tokens": 324611743.0, + "step": 8511 + }, + { + "epoch": 1.0828138913624221, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8640762567520142, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8534947633743286, + "num_tokens": 324653126.0, + "step": 8512 + }, + { + "epoch": 1.0829411016410126, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.872240424156189, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8702216148376465, + "num_tokens": 324692957.0, + "step": 8513 + }, + { + "epoch": 1.0830683119196032, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.06337308883667, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8579962253570557, + "num_tokens": 324737214.0, + "step": 8514 + }, + { + "epoch": 1.0831955221981937, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.5226268768310547, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8736342191696167, + "num_tokens": 324773098.0, + "step": 8515 + }, + { + "epoch": 1.083322732476784, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8424274921417236, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8517113924026489, + "num_tokens": 324815846.0, + "step": 8516 + }, + { + "epoch": 1.0834499427553745, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8352934122085571, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8597966432571411, + "num_tokens": 324855912.0, + "step": 8517 + }, + { + "epoch": 1.083577153033965, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9924322366714478, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8564044237136841, + "num_tokens": 324893090.0, + "step": 8518 + }, + { + "epoch": 1.0837043633125556, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9760475158691406, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.86338871717453, + "num_tokens": 324929779.0, + "step": 8519 + }, + { + "epoch": 1.0838315735911461, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.0282516479492188, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8676909804344177, + "num_tokens": 324961349.0, + "step": 8520 + }, + { + "epoch": 1.0839587838697367, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8754152059555054, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8635907769203186, + "num_tokens": 324998688.0, + "step": 8521 + }, + { + "epoch": 1.0840859941483272, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8890161514282227, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.855920135974884, + "num_tokens": 325040601.0, + "step": 8522 + }, + { + "epoch": 1.0842132044269177, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7677342891693115, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8663533329963684, + "num_tokens": 325081729.0, + "step": 8523 + }, + { + "epoch": 1.0843404147055082, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9923073053359985, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8527747988700867, + "num_tokens": 325118886.0, + "step": 8524 + }, + { + "epoch": 1.0844676249840988, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.050042152404785, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8808759450912476, + "num_tokens": 325156417.0, + "step": 8525 + }, + { + "epoch": 1.0845948352626893, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7692813873291016, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8739073276519775, + "num_tokens": 325200902.0, + "step": 8526 + }, + { + "epoch": 1.0847220455412798, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.938725471496582, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8646668195724487, + "num_tokens": 325233362.0, + "step": 8527 + }, + { + "epoch": 1.0848492558198704, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7741615772247314, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8621522188186646, + "num_tokens": 325274646.0, + "step": 8528 + }, + { + "epoch": 1.0849764660984607, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.088864326477051, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8784989714622498, + "num_tokens": 325304735.0, + "step": 8529 + }, + { + "epoch": 1.0851036763770512, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9696871042251587, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8667541742324829, + "num_tokens": 325339833.0, + "step": 8530 + }, + { + "epoch": 1.0852308866556417, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0143115520477295, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8513988256454468, + "num_tokens": 325373838.0, + "step": 8531 + }, + { + "epoch": 1.0853580969342322, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.912832498550415, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8835275173187256, + "num_tokens": 325412913.0, + "step": 8532 + }, + { + "epoch": 1.0854853072128228, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.996023178100586, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8690109848976135, + "num_tokens": 325451830.0, + "step": 8533 + }, + { + "epoch": 1.0856125174914133, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.04483962059021, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.845510721206665, + "num_tokens": 325487162.0, + "step": 8534 + }, + { + "epoch": 1.0857397277700038, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0045485496520996, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8733004927635193, + "num_tokens": 325525906.0, + "step": 8535 + }, + { + "epoch": 1.0858669380485944, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8538793325424194, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8745452165603638, + "num_tokens": 325559925.0, + "step": 8536 + }, + { + "epoch": 1.0859941483271849, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8176909685134888, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8621145486831665, + "num_tokens": 325599980.0, + "step": 8537 + }, + { + "epoch": 1.0861213586057754, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.4876291751861572, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8463528156280518, + "num_tokens": 325631745.0, + "step": 8538 + }, + { + "epoch": 1.086248568884366, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9206010103225708, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8587868809700012, + "num_tokens": 325673583.0, + "step": 8539 + }, + { + "epoch": 1.0863757791629565, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9090417623519897, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8559044003486633, + "num_tokens": 325712995.0, + "step": 8540 + }, + { + "epoch": 1.0865029894415468, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8827219009399414, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8678994178771973, + "num_tokens": 325748259.0, + "step": 8541 + }, + { + "epoch": 1.0866301997201373, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9210131168365479, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8691374063491821, + "num_tokens": 325787676.0, + "step": 8542 + }, + { + "epoch": 1.0867574099987278, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.869908332824707, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8383004665374756, + "num_tokens": 325827819.0, + "step": 8543 + }, + { + "epoch": 1.0868846202773184, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.947668433189392, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8719557523727417, + "num_tokens": 325866328.0, + "step": 8544 + }, + { + "epoch": 1.0870118305559089, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8725665807724, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8582797050476074, + "num_tokens": 325906752.0, + "step": 8545 + }, + { + "epoch": 1.0871390408344994, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9795374870300293, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8564037084579468, + "num_tokens": 325943765.0, + "step": 8546 + }, + { + "epoch": 1.08726625111309, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.266632556915283, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8698167204856873, + "num_tokens": 325981471.0, + "step": 8547 + }, + { + "epoch": 1.0873934613916805, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.012932777404785, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8686419129371643, + "num_tokens": 326017253.0, + "step": 8548 + }, + { + "epoch": 1.087520671670271, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.93666672706604, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8638409972190857, + "num_tokens": 326053484.0, + "step": 8549 + }, + { + "epoch": 1.0876478819488615, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1191036701202393, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8690351843833923, + "num_tokens": 326084779.0, + "step": 8550 + }, + { + "epoch": 1.087775092227452, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9155499935150146, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8611983060836792, + "num_tokens": 326120795.0, + "step": 8551 + }, + { + "epoch": 1.0879023025060426, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.991396188735962, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8604822158813477, + "num_tokens": 326155991.0, + "step": 8552 + }, + { + "epoch": 1.0880295127846331, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.895974040031433, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8582968711853027, + "num_tokens": 326195738.0, + "step": 8553 + }, + { + "epoch": 1.0881567230632234, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9438276290893555, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8634690642356873, + "num_tokens": 326237773.0, + "step": 8554 + }, + { + "epoch": 1.088283933341814, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8790866136550903, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8767669796943665, + "num_tokens": 326278589.0, + "step": 8555 + }, + { + "epoch": 1.0884111436204045, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.8703491687774658, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8561687469482422, + "num_tokens": 326316862.0, + "step": 8556 + }, + { + "epoch": 1.088538353898995, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9857197999954224, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8600986003875732, + "num_tokens": 326353892.0, + "step": 8557 + }, + { + "epoch": 1.0886655641775855, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.8699418306350708, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8687256574630737, + "num_tokens": 326389478.0, + "step": 8558 + }, + { + "epoch": 1.088792774456176, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.8757164478302002, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8576615452766418, + "num_tokens": 326428072.0, + "step": 8559 + }, + { + "epoch": 1.0889199847347666, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9527833461761475, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.866897702217102, + "num_tokens": 326466374.0, + "step": 8560 + }, + { + "epoch": 1.0890471950133571, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.926825761795044, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8599698543548584, + "num_tokens": 326501081.0, + "step": 8561 + }, + { + "epoch": 1.0891744052919476, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0488240718841553, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8712267279624939, + "num_tokens": 326532539.0, + "step": 8562 + }, + { + "epoch": 1.0893016155705382, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0275683403015137, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8687623739242554, + "num_tokens": 326569481.0, + "step": 8563 + }, + { + "epoch": 1.0894288258491287, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8732396364212036, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8608251810073853, + "num_tokens": 326609746.0, + "step": 8564 + }, + { + "epoch": 1.089556036127719, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9739699363708496, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8527688384056091, + "num_tokens": 326652379.0, + "step": 8565 + }, + { + "epoch": 1.0896832464063095, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0005736351013184, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8585370779037476, + "num_tokens": 326687617.0, + "step": 8566 + }, + { + "epoch": 1.0898104566849, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9420121908187866, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8548895120620728, + "num_tokens": 326724155.0, + "step": 8567 + }, + { + "epoch": 1.0899376669634906, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8105590343475342, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8615595698356628, + "num_tokens": 326768234.0, + "step": 8568 + }, + { + "epoch": 1.0900648772420811, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8406624794006348, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8679471015930176, + "num_tokens": 326810423.0, + "step": 8569 + }, + { + "epoch": 1.0901920875206716, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7870579957962036, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8597899675369263, + "num_tokens": 326851834.0, + "step": 8570 + }, + { + "epoch": 1.0903192977992622, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.886796236038208, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8497087955474854, + "num_tokens": 326889334.0, + "step": 8571 + }, + { + "epoch": 1.0904465080778527, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.884178876876831, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8504924774169922, + "num_tokens": 326931884.0, + "step": 8572 + }, + { + "epoch": 1.0905737183564432, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.060145378112793, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8702232837677002, + "num_tokens": 326969043.0, + "step": 8573 + }, + { + "epoch": 1.0907009286350338, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9956474304199219, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8635643124580383, + "num_tokens": 326998369.0, + "step": 8574 + }, + { + "epoch": 1.0908281389136243, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9220658540725708, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8616254925727844, + "num_tokens": 327036909.0, + "step": 8575 + }, + { + "epoch": 1.0909553491922148, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8983794450759888, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8552254438400269, + "num_tokens": 327076053.0, + "step": 8576 + }, + { + "epoch": 1.0910825594708053, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.15557861328125, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8701038360595703, + "num_tokens": 327112285.0, + "step": 8577 + }, + { + "epoch": 1.0912097697493957, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8137661218643188, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8684119582176208, + "num_tokens": 327151591.0, + "step": 8578 + }, + { + "epoch": 1.0913369800279862, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.870324969291687, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8479192852973938, + "num_tokens": 327193600.0, + "step": 8579 + }, + { + "epoch": 1.0914641903065767, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.932865858078003, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8526666164398193, + "num_tokens": 327230249.0, + "step": 8580 + }, + { + "epoch": 1.0915914005851672, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8113476037979126, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8756751418113708, + "num_tokens": 327270043.0, + "step": 8581 + }, + { + "epoch": 1.0917186108637578, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.300501585006714, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8642001152038574, + "num_tokens": 327309396.0, + "step": 8582 + }, + { + "epoch": 1.0918458211423483, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.792624592781067, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8697792887687683, + "num_tokens": 327349497.0, + "step": 8583 + }, + { + "epoch": 1.0919730314209388, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9101014137268066, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8548035621643066, + "num_tokens": 327385648.0, + "step": 8584 + }, + { + "epoch": 1.0921002416995294, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9881844520568848, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.864167332649231, + "num_tokens": 327426379.0, + "step": 8585 + }, + { + "epoch": 1.0922274519781199, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8742281198501587, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8682079911231995, + "num_tokens": 327467555.0, + "step": 8586 + }, + { + "epoch": 1.0923546622567104, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9362516403198242, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8595436811447144, + "num_tokens": 327505303.0, + "step": 8587 + }, + { + "epoch": 1.092481872535301, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8431506156921387, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8691286444664001, + "num_tokens": 327547854.0, + "step": 8588 + }, + { + "epoch": 1.0926090828138915, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9224225282669067, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8647797703742981, + "num_tokens": 327584132.0, + "step": 8589 + }, + { + "epoch": 1.0927362930924818, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8124147653579712, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8506934642791748, + "num_tokens": 327629250.0, + "step": 8590 + }, + { + "epoch": 1.0928635033710723, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9486769437789917, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8552287817001343, + "num_tokens": 327666988.0, + "step": 8591 + }, + { + "epoch": 1.0929907136496628, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8703584671020508, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8641493320465088, + "num_tokens": 327705955.0, + "step": 8592 + }, + { + "epoch": 1.0931179239282534, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.2728874683380127, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8547165989875793, + "num_tokens": 327746362.0, + "step": 8593 + }, + { + "epoch": 1.0932451342068439, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9057313203811646, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8632030487060547, + "num_tokens": 327789034.0, + "step": 8594 + }, + { + "epoch": 1.0933723444854344, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.053553342819214, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8518348932266235, + "num_tokens": 327824203.0, + "step": 8595 + }, + { + "epoch": 1.093499554764025, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9798169136047363, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8629517555236816, + "num_tokens": 327860340.0, + "step": 8596 + }, + { + "epoch": 1.0936267650426155, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.85429048538208, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8543839454650879, + "num_tokens": 327901709.0, + "step": 8597 + }, + { + "epoch": 1.093753975321206, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8569234609603882, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.863640308380127, + "num_tokens": 327940327.0, + "step": 8598 + }, + { + "epoch": 1.0938811855997965, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8588675260543823, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8676669001579285, + "num_tokens": 327977966.0, + "step": 8599 + }, + { + "epoch": 1.094008395878387, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7843406200408936, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8583847880363464, + "num_tokens": 328021143.0, + "step": 8600 + }, + { + "epoch": 1.0941356061569776, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8225202560424805, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8686913251876831, + "num_tokens": 328060792.0, + "step": 8601 + }, + { + "epoch": 1.094262816435568, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.849605679512024, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8638302087783813, + "num_tokens": 328099698.0, + "step": 8602 + }, + { + "epoch": 1.0943900267141584, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8019644021987915, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8648148775100708, + "num_tokens": 328141075.0, + "step": 8603 + }, + { + "epoch": 1.094517236992749, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8853375911712646, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8553701639175415, + "num_tokens": 328178945.0, + "step": 8604 + }, + { + "epoch": 1.0946444472713395, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0083508491516113, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8362019062042236, + "num_tokens": 328214423.0, + "step": 8605 + }, + { + "epoch": 1.09477165754993, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0080008506774902, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8627139925956726, + "num_tokens": 328248640.0, + "step": 8606 + }, + { + "epoch": 1.0948988678285205, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.074012517929077, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8409236669540405, + "num_tokens": 328283016.0, + "step": 8607 + }, + { + "epoch": 1.095026078107111, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.937068223953247, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8666034936904907, + "num_tokens": 328322377.0, + "step": 8608 + }, + { + "epoch": 1.0951532883857016, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.016111373901367, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8637305498123169, + "num_tokens": 328359409.0, + "step": 8609 + }, + { + "epoch": 1.0952804986642921, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8939658403396606, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8666257858276367, + "num_tokens": 328401257.0, + "step": 8610 + }, + { + "epoch": 1.0954077089428826, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9206492900848389, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8650270104408264, + "num_tokens": 328435886.0, + "step": 8611 + }, + { + "epoch": 1.0955349192214732, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.006488800048828, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8525129556655884, + "num_tokens": 328472091.0, + "step": 8612 + }, + { + "epoch": 1.0956621295000637, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9786832332611084, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.85454261302948, + "num_tokens": 328509453.0, + "step": 8613 + }, + { + "epoch": 1.095789339778654, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9790172576904297, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8627550005912781, + "num_tokens": 328545590.0, + "step": 8614 + }, + { + "epoch": 1.0959165500572445, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.037440299987793, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8522312045097351, + "num_tokens": 328583093.0, + "step": 8615 + }, + { + "epoch": 1.096043760335835, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.71567964553833, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8453220129013062, + "num_tokens": 328625312.0, + "step": 8616 + }, + { + "epoch": 1.0961709706144256, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.066312789916992, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8555347323417664, + "num_tokens": 328655392.0, + "step": 8617 + }, + { + "epoch": 1.0962981808930161, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8984965085983276, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.846869945526123, + "num_tokens": 328693686.0, + "step": 8618 + }, + { + "epoch": 1.0964253911716066, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9086934328079224, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8728073835372925, + "num_tokens": 328728051.0, + "step": 8619 + }, + { + "epoch": 1.0965526014501972, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8379307985305786, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8513901829719543, + "num_tokens": 328768091.0, + "step": 8620 + }, + { + "epoch": 1.0966798117287877, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.304365634918213, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8590115308761597, + "num_tokens": 328803851.0, + "step": 8621 + }, + { + "epoch": 1.0968070220073782, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.921217441558838, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8726546764373779, + "num_tokens": 328837429.0, + "step": 8622 + }, + { + "epoch": 1.0969342322859688, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8910090923309326, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8609744310379028, + "num_tokens": 328874264.0, + "step": 8623 + }, + { + "epoch": 1.0970614425645593, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9204797744750977, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8654392957687378, + "num_tokens": 328911627.0, + "step": 8624 + }, + { + "epoch": 1.0971886528431498, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.235109806060791, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8550530076026917, + "num_tokens": 328944471.0, + "step": 8625 + }, + { + "epoch": 1.0973158631217403, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.761851191520691, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8706158399581909, + "num_tokens": 328983826.0, + "step": 8626 + }, + { + "epoch": 1.0974430734003306, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.19095778465271, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8637988567352295, + "num_tokens": 329019385.0, + "step": 8627 + }, + { + "epoch": 1.0975702836789212, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8989381790161133, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8500791788101196, + "num_tokens": 329065179.0, + "step": 8628 + }, + { + "epoch": 1.0976974939575117, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9089787006378174, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8697981238365173, + "num_tokens": 329101077.0, + "step": 8629 + }, + { + "epoch": 1.0978247042361022, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.3539228439331055, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8664093017578125, + "num_tokens": 329137538.0, + "step": 8630 + }, + { + "epoch": 1.0979519145146928, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9657117128372192, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8757990598678589, + "num_tokens": 329171952.0, + "step": 8631 + }, + { + "epoch": 1.0980791247932833, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8456369638442993, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8701237440109253, + "num_tokens": 329214956.0, + "step": 8632 + }, + { + "epoch": 1.0982063350718738, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.4992892742156982, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8520225286483765, + "num_tokens": 329254228.0, + "step": 8633 + }, + { + "epoch": 1.0983335453504643, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.7065045833587646, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8635631203651428, + "num_tokens": 329288337.0, + "step": 8634 + }, + { + "epoch": 1.0984607556290549, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9878053665161133, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8555300235748291, + "num_tokens": 329329118.0, + "step": 8635 + }, + { + "epoch": 1.0985879659076454, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8789186477661133, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8474934697151184, + "num_tokens": 329366301.0, + "step": 8636 + }, + { + "epoch": 1.098715176186236, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1369378566741943, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8498123288154602, + "num_tokens": 329401612.0, + "step": 8637 + }, + { + "epoch": 1.0988423864648265, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9780926704406738, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8635636568069458, + "num_tokens": 329440403.0, + "step": 8638 + }, + { + "epoch": 1.0989695967434168, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8937327861785889, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8722289800643921, + "num_tokens": 329478699.0, + "step": 8639 + }, + { + "epoch": 1.0990968070220073, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9008389711380005, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.861396312713623, + "num_tokens": 329517860.0, + "step": 8640 + }, + { + "epoch": 1.0992240173005978, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9142515659332275, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8678721785545349, + "num_tokens": 329555385.0, + "step": 8641 + }, + { + "epoch": 1.0993512275791884, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9245054721832275, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8597400784492493, + "num_tokens": 329594775.0, + "step": 8642 + }, + { + "epoch": 1.0994784378577789, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9847832918167114, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8665939569473267, + "num_tokens": 329629872.0, + "step": 8643 + }, + { + "epoch": 1.0996056481363694, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9763590097427368, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8569763898849487, + "num_tokens": 329665263.0, + "step": 8644 + }, + { + "epoch": 1.09973285841496, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9709513187408447, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8522036671638489, + "num_tokens": 329705853.0, + "step": 8645 + }, + { + "epoch": 1.0998600686935505, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.530367136001587, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8691980838775635, + "num_tokens": 329741690.0, + "step": 8646 + }, + { + "epoch": 1.099987278972141, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8693617582321167, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8552584648132324, + "num_tokens": 329781548.0, + "step": 8647 + }, + { + "epoch": 1.1001144892507315, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.730158805847168, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8693014979362488, + "num_tokens": 329822304.0, + "step": 8648 + }, + { + "epoch": 1.100241699529322, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9422812461853027, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.862777829170227, + "num_tokens": 329856065.0, + "step": 8649 + }, + { + "epoch": 1.1003689098079126, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9342817068099976, + "learning_rate": 1e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8467006683349609, + "num_tokens": 329895002.0, + "step": 8650 + }, + { + "epoch": 1.100496120086503, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7957440614700317, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8561018705368042, + "num_tokens": 329938810.0, + "step": 8651 + }, + { + "epoch": 1.1006233303650934, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9680763483047485, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8749241232872009, + "num_tokens": 329976513.0, + "step": 8652 + }, + { + "epoch": 1.100750540643684, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.939575433731079, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8686760663986206, + "num_tokens": 330014013.0, + "step": 8653 + }, + { + "epoch": 1.1008777509222745, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8823392391204834, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8571211099624634, + "num_tokens": 330052577.0, + "step": 8654 + }, + { + "epoch": 1.101004961200865, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8073190450668335, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8605920672416687, + "num_tokens": 330091385.0, + "step": 8655 + }, + { + "epoch": 1.1011321714794555, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.3639235496520996, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8708823919296265, + "num_tokens": 330130285.0, + "step": 8656 + }, + { + "epoch": 1.101259381758046, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9601103067398071, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8489668369293213, + "num_tokens": 330166823.0, + "step": 8657 + }, + { + "epoch": 1.1013865920366366, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.943990707397461, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8642070293426514, + "num_tokens": 330200655.0, + "step": 8658 + }, + { + "epoch": 1.101513802315227, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.782504916191101, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8611116409301758, + "num_tokens": 330243656.0, + "step": 8659 + }, + { + "epoch": 1.1016410125938176, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9326753616333008, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8547288179397583, + "num_tokens": 330287200.0, + "step": 8660 + }, + { + "epoch": 1.1017682228724082, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7968114614486694, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8648929595947266, + "num_tokens": 330324794.0, + "step": 8661 + }, + { + "epoch": 1.1018954331509987, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8262938261032104, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8637334704399109, + "num_tokens": 330365297.0, + "step": 8662 + }, + { + "epoch": 1.102022643429589, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7155635356903076, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8792321681976318, + "num_tokens": 330408046.0, + "step": 8663 + }, + { + "epoch": 1.1021498537081795, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7445340156555176, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8588305711746216, + "num_tokens": 330447815.0, + "step": 8664 + }, + { + "epoch": 1.10227706398677, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.773612380027771, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8511637449264526, + "num_tokens": 330487796.0, + "step": 8665 + }, + { + "epoch": 1.1024042742653606, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.6441447734832764, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8735355138778687, + "num_tokens": 330531155.0, + "step": 8666 + }, + { + "epoch": 1.1025314845439511, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8232123851776123, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8703938722610474, + "num_tokens": 330568611.0, + "step": 8667 + }, + { + "epoch": 1.1026586948225416, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.460892915725708, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8571228384971619, + "num_tokens": 330608085.0, + "step": 8668 + }, + { + "epoch": 1.1027859051011322, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.994994044303894, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8455386161804199, + "num_tokens": 330644776.0, + "step": 8669 + }, + { + "epoch": 1.1029131153797227, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.889648675918579, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8519484996795654, + "num_tokens": 330686401.0, + "step": 8670 + }, + { + "epoch": 1.1030403256583132, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9479087591171265, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8671520948410034, + "num_tokens": 330723115.0, + "step": 8671 + }, + { + "epoch": 1.1031675359369038, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7661527395248413, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8776314854621887, + "num_tokens": 330761170.0, + "step": 8672 + }, + { + "epoch": 1.1032947462154943, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.055783748626709, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8626695275306702, + "num_tokens": 330790230.0, + "step": 8673 + }, + { + "epoch": 1.1034219564940848, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.897060513496399, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8740747570991516, + "num_tokens": 330826178.0, + "step": 8674 + }, + { + "epoch": 1.1035491667726753, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 2.009150743484497, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8690979480743408, + "num_tokens": 330866341.0, + "step": 8675 + }, + { + "epoch": 1.1036763770512656, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.012462615966797, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.84816575050354, + "num_tokens": 330903955.0, + "step": 8676 + }, + { + "epoch": 1.1038035873298562, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.8487555980682373, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8584177494049072, + "num_tokens": 330943024.0, + "step": 8677 + }, + { + "epoch": 1.1039307976084467, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8503749370574951, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8563961982727051, + "num_tokens": 330981004.0, + "step": 8678 + }, + { + "epoch": 1.1040580078870372, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7412970066070557, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.860559344291687, + "num_tokens": 331021532.0, + "step": 8679 + }, + { + "epoch": 1.1041852181656278, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8222609758377075, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8685942888259888, + "num_tokens": 331058010.0, + "step": 8680 + }, + { + "epoch": 1.1043124284442183, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9924588203430176, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8711202144622803, + "num_tokens": 331090713.0, + "step": 8681 + }, + { + "epoch": 1.1044396387228088, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8109593391418457, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8682870864868164, + "num_tokens": 331127532.0, + "step": 8682 + }, + { + "epoch": 1.1045668490013993, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 16.606000900268555, + "learning_rate": 1e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8413019776344299, + "num_tokens": 331172088.0, + "step": 8683 + }, + { + "epoch": 1.1046940592799899, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1507515907287598, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.856927216053009, + "num_tokens": 331213419.0, + "step": 8684 + }, + { + "epoch": 1.1048212695585804, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9950095415115356, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8619555234909058, + "num_tokens": 331250102.0, + "step": 8685 + }, + { + "epoch": 1.104948479837171, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9825650453567505, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8644269108772278, + "num_tokens": 331284597.0, + "step": 8686 + }, + { + "epoch": 1.1050756901157615, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8645941019058228, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8556815981864929, + "num_tokens": 331330688.0, + "step": 8687 + }, + { + "epoch": 1.1052029003943518, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9272615909576416, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.865039587020874, + "num_tokens": 331368777.0, + "step": 8688 + }, + { + "epoch": 1.1053301106729423, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7292451858520508, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.867012619972229, + "num_tokens": 331414369.0, + "step": 8689 + }, + { + "epoch": 1.1054573209515328, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9583160877227783, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8678327798843384, + "num_tokens": 331452788.0, + "step": 8690 + }, + { + "epoch": 1.1055845312301233, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9708486795425415, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.850982666015625, + "num_tokens": 331489575.0, + "step": 8691 + }, + { + "epoch": 1.1057117415087139, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8852858543395996, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8798322677612305, + "num_tokens": 331529129.0, + "step": 8692 + }, + { + "epoch": 1.1058389517873044, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.012716054916382, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8572273254394531, + "num_tokens": 331564989.0, + "step": 8693 + }, + { + "epoch": 1.105966162065895, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9431029558181763, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8657601475715637, + "num_tokens": 331602561.0, + "step": 8694 + }, + { + "epoch": 1.1060933723444855, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8554106950759888, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.862643837928772, + "num_tokens": 331646902.0, + "step": 8695 + }, + { + "epoch": 1.106220582623076, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8629761934280396, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8632619380950928, + "num_tokens": 331682347.0, + "step": 8696 + }, + { + "epoch": 1.1063477929016665, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.785745620727539, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8546445369720459, + "num_tokens": 331725813.0, + "step": 8697 + }, + { + "epoch": 1.106475003180257, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.965075135231018, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8423293828964233, + "num_tokens": 331767856.0, + "step": 8698 + }, + { + "epoch": 1.1066022134588476, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.813277006149292, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8708491325378418, + "num_tokens": 331804238.0, + "step": 8699 + }, + { + "epoch": 1.106729423737438, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8470886945724487, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8630889654159546, + "num_tokens": 331839963.0, + "step": 8700 + }, + { + "epoch": 1.1068566340160284, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.803200125694275, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8659547567367554, + "num_tokens": 331881563.0, + "step": 8701 + }, + { + "epoch": 1.106983844294619, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7458908557891846, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8584740161895752, + "num_tokens": 331925540.0, + "step": 8702 + }, + { + "epoch": 1.1071110545732095, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9129629135131836, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8702768683433533, + "num_tokens": 331963171.0, + "step": 8703 + }, + { + "epoch": 1.1072382648518, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8311848640441895, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8828068971633911, + "num_tokens": 331998310.0, + "step": 8704 + }, + { + "epoch": 1.1073654751303905, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8758883476257324, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8674793243408203, + "num_tokens": 332034181.0, + "step": 8705 + }, + { + "epoch": 1.107492685408981, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8786075115203857, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8722565770149231, + "num_tokens": 332075368.0, + "step": 8706 + }, + { + "epoch": 1.1076198956875716, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8659783601760864, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8736510276794434, + "num_tokens": 332113281.0, + "step": 8707 + }, + { + "epoch": 1.107747105966162, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7990953922271729, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8542608618736267, + "num_tokens": 332154206.0, + "step": 8708 + }, + { + "epoch": 1.1078743162447526, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.915081262588501, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8476127982139587, + "num_tokens": 332195409.0, + "step": 8709 + }, + { + "epoch": 1.1080015265233432, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8426692485809326, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8709255456924438, + "num_tokens": 332233836.0, + "step": 8710 + }, + { + "epoch": 1.1081287368019337, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8597779273986816, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8674269914627075, + "num_tokens": 332274167.0, + "step": 8711 + }, + { + "epoch": 1.108255947080524, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9044792652130127, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8588153123855591, + "num_tokens": 332315856.0, + "step": 8712 + }, + { + "epoch": 1.1083831573591145, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8186975717544556, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8613648414611816, + "num_tokens": 332366365.0, + "step": 8713 + }, + { + "epoch": 1.108510367637705, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.041868209838867, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8588135838508606, + "num_tokens": 332400411.0, + "step": 8714 + }, + { + "epoch": 1.1086375779162956, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7908837795257568, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8653866648674011, + "num_tokens": 332441959.0, + "step": 8715 + }, + { + "epoch": 1.108764788194886, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.0091488361358643, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8496273159980774, + "num_tokens": 332479545.0, + "step": 8716 + }, + { + "epoch": 1.1088919984734766, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8251699209213257, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8729743957519531, + "num_tokens": 332516240.0, + "step": 8717 + }, + { + "epoch": 1.1090192087520672, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9304876327514648, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8666250705718994, + "num_tokens": 332553488.0, + "step": 8718 + }, + { + "epoch": 1.1091464190306577, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.056300401687622, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8529407978057861, + "num_tokens": 332588646.0, + "step": 8719 + }, + { + "epoch": 1.1092736293092482, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.900266408920288, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8529071807861328, + "num_tokens": 332629103.0, + "step": 8720 + }, + { + "epoch": 1.1094008395878387, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0012638568878174, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8675004839897156, + "num_tokens": 332664820.0, + "step": 8721 + }, + { + "epoch": 1.1095280498664293, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7994587421417236, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8648493885993958, + "num_tokens": 332699772.0, + "step": 8722 + }, + { + "epoch": 1.1096552601450198, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.048976421356201, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.870806097984314, + "num_tokens": 332733478.0, + "step": 8723 + }, + { + "epoch": 1.1097824704236103, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7382301092147827, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.858812689781189, + "num_tokens": 332774733.0, + "step": 8724 + }, + { + "epoch": 1.1099096807022006, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9368120431900024, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8392369747161865, + "num_tokens": 332812970.0, + "step": 8725 + }, + { + "epoch": 1.1100368909807912, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.870041012763977, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8583012223243713, + "num_tokens": 332854974.0, + "step": 8726 + }, + { + "epoch": 1.1101641012593817, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.959523320198059, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.847679853439331, + "num_tokens": 332886462.0, + "step": 8727 + }, + { + "epoch": 1.1102913115379722, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9196960926055908, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8584131002426147, + "num_tokens": 332928614.0, + "step": 8728 + }, + { + "epoch": 1.1104185218165628, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0136165618896484, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8637388348579407, + "num_tokens": 332962447.0, + "step": 8729 + }, + { + "epoch": 1.1105457320951533, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0779147148132324, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8536170721054077, + "num_tokens": 332991395.0, + "step": 8730 + }, + { + "epoch": 1.1106729423737438, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.139979362487793, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8551478981971741, + "num_tokens": 333027852.0, + "step": 8731 + }, + { + "epoch": 1.1108001526523343, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9798802137374878, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8648167848587036, + "num_tokens": 333063337.0, + "step": 8732 + }, + { + "epoch": 1.1109273629309249, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.336437463760376, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8527168035507202, + "num_tokens": 333094632.0, + "step": 8733 + }, + { + "epoch": 1.1110545732095154, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.814336895942688, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8758083581924438, + "num_tokens": 333134071.0, + "step": 8734 + }, + { + "epoch": 1.111181783488106, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8562806844711304, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8680984973907471, + "num_tokens": 333171288.0, + "step": 8735 + }, + { + "epoch": 1.1113089937666965, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7258989810943604, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8689508438110352, + "num_tokens": 333212111.0, + "step": 8736 + }, + { + "epoch": 1.1114362040452868, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.01383638381958, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8673842549324036, + "num_tokens": 333247557.0, + "step": 8737 + }, + { + "epoch": 1.1115634143238773, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.884581446647644, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8615055680274963, + "num_tokens": 333278705.0, + "step": 8738 + }, + { + "epoch": 1.1116906246024678, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9468051195144653, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8649744987487793, + "num_tokens": 333316493.0, + "step": 8739 + }, + { + "epoch": 1.1118178348810583, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8178601264953613, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8646939992904663, + "num_tokens": 333360384.0, + "step": 8740 + }, + { + "epoch": 1.1119450451596489, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9207615852355957, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8621110916137695, + "num_tokens": 333395568.0, + "step": 8741 + }, + { + "epoch": 1.1120722554382394, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9113467931747437, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8663070797920227, + "num_tokens": 333429267.0, + "step": 8742 + }, + { + "epoch": 1.11219946571683, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8688465356826782, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8681523203849792, + "num_tokens": 333469908.0, + "step": 8743 + }, + { + "epoch": 1.1123266759954205, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.828233003616333, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.865500271320343, + "num_tokens": 333507865.0, + "step": 8744 + }, + { + "epoch": 1.112453886274011, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.99732506275177, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8739114999771118, + "num_tokens": 333544011.0, + "step": 8745 + }, + { + "epoch": 1.1125810965526015, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9078863859176636, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8656183481216431, + "num_tokens": 333581949.0, + "step": 8746 + }, + { + "epoch": 1.112708306831192, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9113937616348267, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8709102272987366, + "num_tokens": 333615994.0, + "step": 8747 + }, + { + "epoch": 1.1128355171097826, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9876230955123901, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8781803846359253, + "num_tokens": 333654454.0, + "step": 8748 + }, + { + "epoch": 1.112962727388373, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8852641582489014, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8541209697723389, + "num_tokens": 333700206.0, + "step": 8749 + }, + { + "epoch": 1.1130899376669634, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7805284261703491, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8661747574806213, + "num_tokens": 333737538.0, + "step": 8750 + }, + { + "epoch": 1.113217147945554, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9796180725097656, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8643990755081177, + "num_tokens": 333771237.0, + "step": 8751 + }, + { + "epoch": 1.1133443582241445, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.911887288093567, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8599191308021545, + "num_tokens": 333806522.0, + "step": 8752 + }, + { + "epoch": 1.113471568502735, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8432114124298096, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8620275259017944, + "num_tokens": 333846656.0, + "step": 8753 + }, + { + "epoch": 1.1135987787813255, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8567160367965698, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8538079261779785, + "num_tokens": 333886294.0, + "step": 8754 + }, + { + "epoch": 1.113725989059916, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.861804485321045, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8573223352432251, + "num_tokens": 333923868.0, + "step": 8755 + }, + { + "epoch": 1.1138531993385066, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9220372438430786, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8672310709953308, + "num_tokens": 333957187.0, + "step": 8756 + }, + { + "epoch": 1.113980409617097, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.4907138347625732, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8777551651000977, + "num_tokens": 333999061.0, + "step": 8757 + }, + { + "epoch": 1.1141076198956876, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7848286628723145, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.857661783695221, + "num_tokens": 334043595.0, + "step": 8758 + }, + { + "epoch": 1.1142348301742782, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9130980968475342, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8680720925331116, + "num_tokens": 334078487.0, + "step": 8759 + }, + { + "epoch": 1.1143620404528687, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7233368158340454, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8703093528747559, + "num_tokens": 334119610.0, + "step": 8760 + }, + { + "epoch": 1.114489250731459, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.824650526046753, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8580727577209473, + "num_tokens": 334160149.0, + "step": 8761 + }, + { + "epoch": 1.1146164610100495, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.870841145515442, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8643739223480225, + "num_tokens": 334198667.0, + "step": 8762 + }, + { + "epoch": 1.11474367128864, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0892837047576904, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8650421500205994, + "num_tokens": 334233838.0, + "step": 8763 + }, + { + "epoch": 1.1148708815672306, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.709815263748169, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8544391989707947, + "num_tokens": 334275703.0, + "step": 8764 + }, + { + "epoch": 1.114998091845821, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9605801105499268, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8764404654502869, + "num_tokens": 334311119.0, + "step": 8765 + }, + { + "epoch": 1.1151253021244116, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9223065376281738, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8555630445480347, + "num_tokens": 334348298.0, + "step": 8766 + }, + { + "epoch": 1.1152525124030022, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9365644454956055, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8501645922660828, + "num_tokens": 334386986.0, + "step": 8767 + }, + { + "epoch": 1.1153797226815927, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8673597574234009, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8666142225265503, + "num_tokens": 334423263.0, + "step": 8768 + }, + { + "epoch": 1.1155069329601832, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9070836305618286, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8786385655403137, + "num_tokens": 334460363.0, + "step": 8769 + }, + { + "epoch": 1.1156341432387737, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8715356588363647, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.866524338722229, + "num_tokens": 334501883.0, + "step": 8770 + }, + { + "epoch": 1.1157613535173643, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0348143577575684, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8628279566764832, + "num_tokens": 334536242.0, + "step": 8771 + }, + { + "epoch": 1.1158885637959548, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.0339298248291016, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8749024868011475, + "num_tokens": 334569685.0, + "step": 8772 + }, + { + "epoch": 1.1160157740745453, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0242958068847656, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.866877019405365, + "num_tokens": 334604946.0, + "step": 8773 + }, + { + "epoch": 1.1161429843531356, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.3044629096984863, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8682522177696228, + "num_tokens": 334645495.0, + "step": 8774 + }, + { + "epoch": 1.1162701946317262, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.95896315574646, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8569536805152893, + "num_tokens": 334684239.0, + "step": 8775 + }, + { + "epoch": 1.1163974049103167, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.162475109100342, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8688619136810303, + "num_tokens": 334713799.0, + "step": 8776 + }, + { + "epoch": 1.1165246151889072, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.816800594329834, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8846679925918579, + "num_tokens": 334753374.0, + "step": 8777 + }, + { + "epoch": 1.1166518254674977, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8022865056991577, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.863710343837738, + "num_tokens": 334792867.0, + "step": 8778 + }, + { + "epoch": 1.1167790357460883, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8681674003601074, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8598135709762573, + "num_tokens": 334832664.0, + "step": 8779 + }, + { + "epoch": 1.1169062460246788, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8460489511489868, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8567032814025879, + "num_tokens": 334876763.0, + "step": 8780 + }, + { + "epoch": 1.1170334563032693, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0295872688293457, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8632948994636536, + "num_tokens": 334909431.0, + "step": 8781 + }, + { + "epoch": 1.1171606665818599, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1452767848968506, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8610771894454956, + "num_tokens": 334941939.0, + "step": 8782 + }, + { + "epoch": 1.1172878768604504, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.3658647537231445, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8726571798324585, + "num_tokens": 334981338.0, + "step": 8783 + }, + { + "epoch": 1.117415087139041, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0331411361694336, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8640800714492798, + "num_tokens": 335020660.0, + "step": 8784 + }, + { + "epoch": 1.1175422974176314, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0754787921905518, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8661545515060425, + "num_tokens": 335057377.0, + "step": 8785 + }, + { + "epoch": 1.1176695076962218, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9409866333007812, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8540692329406738, + "num_tokens": 335092912.0, + "step": 8786 + }, + { + "epoch": 1.1177967179748123, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0378499031066895, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8784427046775818, + "num_tokens": 335129157.0, + "step": 8787 + }, + { + "epoch": 1.1179239282534028, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.910788655281067, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8634680509567261, + "num_tokens": 335168065.0, + "step": 8788 + }, + { + "epoch": 1.1180511385319933, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.742746114730835, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8618333339691162, + "num_tokens": 335218876.0, + "step": 8789 + }, + { + "epoch": 1.1181783488105839, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7888288497924805, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8729333877563477, + "num_tokens": 335256300.0, + "step": 8790 + }, + { + "epoch": 1.1183055590891744, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9811955690383911, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8732739090919495, + "num_tokens": 335291035.0, + "step": 8791 + }, + { + "epoch": 1.118432769367765, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9024875164031982, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8748397827148438, + "num_tokens": 335330092.0, + "step": 8792 + }, + { + "epoch": 1.1185599796463555, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0209310054779053, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8491197824478149, + "num_tokens": 335367146.0, + "step": 8793 + }, + { + "epoch": 1.118687189924946, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.3553223609924316, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8636957406997681, + "num_tokens": 335405069.0, + "step": 8794 + }, + { + "epoch": 1.1188144002035365, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9147273302078247, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8539397120475769, + "num_tokens": 335443779.0, + "step": 8795 + }, + { + "epoch": 1.118941610482127, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.849120020866394, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8550203442573547, + "num_tokens": 335485978.0, + "step": 8796 + }, + { + "epoch": 1.1190688207607176, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8273500204086304, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8676132559776306, + "num_tokens": 335530375.0, + "step": 8797 + }, + { + "epoch": 1.119196031039308, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8171693086624146, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8685501217842102, + "num_tokens": 335569109.0, + "step": 8798 + }, + { + "epoch": 1.1193232413178984, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.4608469009399414, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8490137457847595, + "num_tokens": 335599283.0, + "step": 8799 + }, + { + "epoch": 1.119450451596489, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.131300687789917, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8702126741409302, + "num_tokens": 335634792.0, + "step": 8800 + }, + { + "epoch": 1.1195776618750795, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9478325843811035, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8756240606307983, + "num_tokens": 335672869.0, + "step": 8801 + }, + { + "epoch": 1.11970487215367, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8282108306884766, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.868738055229187, + "num_tokens": 335714996.0, + "step": 8802 + }, + { + "epoch": 1.1198320824322605, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8021217584609985, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8545867204666138, + "num_tokens": 335756884.0, + "step": 8803 + }, + { + "epoch": 1.119959292710851, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 20.47403335571289, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8728179931640625, + "num_tokens": 335793935.0, + "step": 8804 + }, + { + "epoch": 1.1200865029894416, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.209909200668335, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8764431476593018, + "num_tokens": 335834169.0, + "step": 8805 + }, + { + "epoch": 1.120213713268032, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8803080320358276, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.882365345954895, + "num_tokens": 335872302.0, + "step": 8806 + }, + { + "epoch": 1.1203409235466226, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0145347118377686, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8636264204978943, + "num_tokens": 335908074.0, + "step": 8807 + }, + { + "epoch": 1.1204681338252132, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9633127450942993, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8746394515037537, + "num_tokens": 335946268.0, + "step": 8808 + }, + { + "epoch": 1.1205953441038037, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7597768306732178, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8649565577507019, + "num_tokens": 335985364.0, + "step": 8809 + }, + { + "epoch": 1.120722554382394, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8803163766860962, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.867179811000824, + "num_tokens": 336025300.0, + "step": 8810 + }, + { + "epoch": 1.1208497646609845, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8443416357040405, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8660985231399536, + "num_tokens": 336062105.0, + "step": 8811 + }, + { + "epoch": 1.120976974939575, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9075958728790283, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8540380597114563, + "num_tokens": 336102944.0, + "step": 8812 + }, + { + "epoch": 1.1211041852181656, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.765710711479187, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8902148008346558, + "num_tokens": 336142964.0, + "step": 8813 + }, + { + "epoch": 1.121231395496756, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.756273865699768, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8664096593856812, + "num_tokens": 336186962.0, + "step": 8814 + }, + { + "epoch": 1.1213586057753466, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9551619291305542, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8732771873474121, + "num_tokens": 336220775.0, + "step": 8815 + }, + { + "epoch": 1.1214858160539372, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.786513090133667, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8602200746536255, + "num_tokens": 336263067.0, + "step": 8816 + }, + { + "epoch": 1.1216130263325277, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.7645081281661987, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8678436279296875, + "num_tokens": 336308463.0, + "step": 8817 + }, + { + "epoch": 1.1217402366111182, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9986991882324219, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8771616220474243, + "num_tokens": 336346965.0, + "step": 8818 + }, + { + "epoch": 1.1218674468897087, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.032097101211548, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8659331798553467, + "num_tokens": 336382114.0, + "step": 8819 + }, + { + "epoch": 1.1219946571682993, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.073223352432251, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8684716820716858, + "num_tokens": 336414020.0, + "step": 8820 + }, + { + "epoch": 1.1221218674468898, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9246702194213867, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.861213207244873, + "num_tokens": 336452522.0, + "step": 8821 + }, + { + "epoch": 1.1222490777254803, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9457306861877441, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8746715784072876, + "num_tokens": 336484593.0, + "step": 8822 + }, + { + "epoch": 1.1223762880040706, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1134748458862305, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.846160888671875, + "num_tokens": 336518964.0, + "step": 8823 + }, + { + "epoch": 1.1225034982826612, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.528301477432251, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8499578833580017, + "num_tokens": 336556320.0, + "step": 8824 + }, + { + "epoch": 1.1226307085612517, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 2.0788979530334473, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8544056415557861, + "num_tokens": 336595473.0, + "step": 8825 + }, + { + "epoch": 1.1227579188398422, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0254712104797363, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.863774836063385, + "num_tokens": 336631793.0, + "step": 8826 + }, + { + "epoch": 1.1228851291184327, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.099285840988159, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.848536491394043, + "num_tokens": 336671230.0, + "step": 8827 + }, + { + "epoch": 1.1230123393970233, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8048747777938843, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8644825220108032, + "num_tokens": 336712015.0, + "step": 8828 + }, + { + "epoch": 1.1231395496756138, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.2232706546783447, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8488500714302063, + "num_tokens": 336741206.0, + "step": 8829 + }, + { + "epoch": 1.1232667599542043, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.150567054748535, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8567715883255005, + "num_tokens": 336778838.0, + "step": 8830 + }, + { + "epoch": 1.1233939702327949, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8599637746810913, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8699069023132324, + "num_tokens": 336815471.0, + "step": 8831 + }, + { + "epoch": 1.1235211805113854, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7812894582748413, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8657983541488647, + "num_tokens": 336850870.0, + "step": 8832 + }, + { + "epoch": 1.123648390789976, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9175090789794922, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8687703609466553, + "num_tokens": 336891779.0, + "step": 8833 + }, + { + "epoch": 1.1237756010685664, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9052852392196655, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8742832541465759, + "num_tokens": 336931171.0, + "step": 8834 + }, + { + "epoch": 1.1239028113471567, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9056657552719116, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8734995126724243, + "num_tokens": 336963080.0, + "step": 8835 + }, + { + "epoch": 1.1240300216257473, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9074232578277588, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8493467569351196, + "num_tokens": 337000817.0, + "step": 8836 + }, + { + "epoch": 1.1241572319043378, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9223439693450928, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8574556112289429, + "num_tokens": 337037249.0, + "step": 8837 + }, + { + "epoch": 1.1242844421829283, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7872657775878906, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8719051480293274, + "num_tokens": 337077752.0, + "step": 8838 + }, + { + "epoch": 1.1244116524615189, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0915915966033936, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8601104617118835, + "num_tokens": 337119649.0, + "step": 8839 + }, + { + "epoch": 1.1245388627401094, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0035696029663086, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8472985625267029, + "num_tokens": 337154376.0, + "step": 8840 + }, + { + "epoch": 1.1246660730187, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8848994970321655, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8722960352897644, + "num_tokens": 337193377.0, + "step": 8841 + }, + { + "epoch": 1.1247932832972904, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.888106346130371, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8528045415878296, + "num_tokens": 337233176.0, + "step": 8842 + }, + { + "epoch": 1.124920493575881, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9842984676361084, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.849468469619751, + "num_tokens": 337262995.0, + "step": 8843 + }, + { + "epoch": 1.1250477038544715, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7591652870178223, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8627516627311707, + "num_tokens": 337302466.0, + "step": 8844 + }, + { + "epoch": 1.125174914133062, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.809322714805603, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8749127388000488, + "num_tokens": 337341197.0, + "step": 8845 + }, + { + "epoch": 1.1253021244116526, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9994378089904785, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8669368624687195, + "num_tokens": 337375602.0, + "step": 8846 + }, + { + "epoch": 1.125429334690243, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.090209484100342, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8541061878204346, + "num_tokens": 337406685.0, + "step": 8847 + }, + { + "epoch": 1.1255565449688334, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8631908893585205, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8589214086532593, + "num_tokens": 337447364.0, + "step": 8848 + }, + { + "epoch": 1.125683755247424, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0350961685180664, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8634143471717834, + "num_tokens": 337487840.0, + "step": 8849 + }, + { + "epoch": 1.1258109655260145, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8531917333602905, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8670140504837036, + "num_tokens": 337525117.0, + "step": 8850 + }, + { + "epoch": 1.125938175804605, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 3.6339926719665527, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8560008406639099, + "num_tokens": 337567024.0, + "step": 8851 + }, + { + "epoch": 1.1260653860831955, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.391305923461914, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8619816899299622, + "num_tokens": 337600117.0, + "step": 8852 + }, + { + "epoch": 1.126192596361786, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8072758913040161, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8478670716285706, + "num_tokens": 337637188.0, + "step": 8853 + }, + { + "epoch": 1.1263198066403766, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9000365734100342, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8593450784683228, + "num_tokens": 337676492.0, + "step": 8854 + }, + { + "epoch": 1.126447016918967, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.867639422416687, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.878040075302124, + "num_tokens": 337713468.0, + "step": 8855 + }, + { + "epoch": 1.1265742271975576, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9237464666366577, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8492431640625, + "num_tokens": 337750944.0, + "step": 8856 + }, + { + "epoch": 1.1267014374761481, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8865712881088257, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8782448768615723, + "num_tokens": 337793802.0, + "step": 8857 + }, + { + "epoch": 1.1268286477547387, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8990845680236816, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8766626119613647, + "num_tokens": 337828550.0, + "step": 8858 + }, + { + "epoch": 1.126955858033329, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0375094413757324, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8720261454582214, + "num_tokens": 337862168.0, + "step": 8859 + }, + { + "epoch": 1.1270830683119195, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8753360509872437, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.860158383846283, + "num_tokens": 337898837.0, + "step": 8860 + }, + { + "epoch": 1.12721027859051, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0431253910064697, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8501244187355042, + "num_tokens": 337939608.0, + "step": 8861 + }, + { + "epoch": 1.1273374888691006, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 80.5355224609375, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8626809120178223, + "num_tokens": 337983412.0, + "step": 8862 + }, + { + "epoch": 1.127464699147691, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.578890800476074, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8730567693710327, + "num_tokens": 338022171.0, + "step": 8863 + }, + { + "epoch": 1.1275919094262816, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 2.148136615753174, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8596544861793518, + "num_tokens": 338058838.0, + "step": 8864 + }, + { + "epoch": 1.1277191197048722, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0342583656311035, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8585423827171326, + "num_tokens": 338095080.0, + "step": 8865 + }, + { + "epoch": 1.1278463299834627, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8731145858764648, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8460899591445923, + "num_tokens": 338134206.0, + "step": 8866 + }, + { + "epoch": 1.1279735402620532, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 3.2011170387268066, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8691738843917847, + "num_tokens": 338173794.0, + "step": 8867 + }, + { + "epoch": 1.1281007505406437, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0653109550476074, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8604243993759155, + "num_tokens": 338206591.0, + "step": 8868 + }, + { + "epoch": 1.1282279608192343, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9782192707061768, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8766651153564453, + "num_tokens": 338238728.0, + "step": 8869 + }, + { + "epoch": 1.1283551710978248, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7719812393188477, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8731279373168945, + "num_tokens": 338282372.0, + "step": 8870 + }, + { + "epoch": 1.1284823813764153, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9154736995697021, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8578681349754333, + "num_tokens": 338319175.0, + "step": 8871 + }, + { + "epoch": 1.1286095916550056, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.907966136932373, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8619036674499512, + "num_tokens": 338357967.0, + "step": 8872 + }, + { + "epoch": 1.1287368019335962, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9266196489334106, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8690453767776489, + "num_tokens": 338391431.0, + "step": 8873 + }, + { + "epoch": 1.1288640122121867, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9264280796051025, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8713014125823975, + "num_tokens": 338426298.0, + "step": 8874 + }, + { + "epoch": 1.1289912224907772, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9264053106307983, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8452891111373901, + "num_tokens": 338468147.0, + "step": 8875 + }, + { + "epoch": 1.1291184327693677, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.896692156791687, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8466697931289673, + "num_tokens": 338517095.0, + "step": 8876 + }, + { + "epoch": 1.1292456430479583, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.0784966945648193, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.869124710559845, + "num_tokens": 338549925.0, + "step": 8877 + }, + { + "epoch": 1.1293728533265488, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8897844552993774, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8671263456344604, + "num_tokens": 338583966.0, + "step": 8878 + }, + { + "epoch": 1.1295000636051393, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8168803453445435, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8682944178581238, + "num_tokens": 338622715.0, + "step": 8879 + }, + { + "epoch": 1.1296272738837299, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0521774291992188, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8501735925674438, + "num_tokens": 338659920.0, + "step": 8880 + }, + { + "epoch": 1.1297544841623204, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9051673412322998, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8575019836425781, + "num_tokens": 338698183.0, + "step": 8881 + }, + { + "epoch": 1.129881694440911, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.973296046257019, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8518929481506348, + "num_tokens": 338735853.0, + "step": 8882 + }, + { + "epoch": 1.1300089047195012, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1863515377044678, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8533826470375061, + "num_tokens": 338771714.0, + "step": 8883 + }, + { + "epoch": 1.1301361149980917, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8165283203125, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8813172578811646, + "num_tokens": 338813823.0, + "step": 8884 + }, + { + "epoch": 1.1302633252766823, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.220221996307373, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8678646087646484, + "num_tokens": 338858137.0, + "step": 8885 + }, + { + "epoch": 1.1303905355552728, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.935640811920166, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8518304824829102, + "num_tokens": 338893653.0, + "step": 8886 + }, + { + "epoch": 1.1305177458338633, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.916353464126587, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8441742658615112, + "num_tokens": 338931230.0, + "step": 8887 + }, + { + "epoch": 1.1306449561124539, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.5999834537506104, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8638923168182373, + "num_tokens": 338966921.0, + "step": 8888 + }, + { + "epoch": 1.1307721663910444, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0737533569335938, + "learning_rate": 1e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8445583581924438, + "num_tokens": 339000957.0, + "step": 8889 + }, + { + "epoch": 1.130899376669635, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.172067880630493, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8553508520126343, + "num_tokens": 339035876.0, + "step": 8890 + }, + { + "epoch": 1.1310265869482254, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9208862781524658, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.863910436630249, + "num_tokens": 339073700.0, + "step": 8891 + }, + { + "epoch": 1.131153797226816, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.144533395767212, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8669448494911194, + "num_tokens": 339108222.0, + "step": 8892 + }, + { + "epoch": 1.1312810075054065, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.064310073852539, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8749037384986877, + "num_tokens": 339145749.0, + "step": 8893 + }, + { + "epoch": 1.131408217783997, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.056480646133423, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.848157525062561, + "num_tokens": 339183852.0, + "step": 8894 + }, + { + "epoch": 1.1315354280625876, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9632536172866821, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8773870468139648, + "num_tokens": 339218766.0, + "step": 8895 + }, + { + "epoch": 1.131662638341178, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9223697185516357, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8637338280677795, + "num_tokens": 339259932.0, + "step": 8896 + }, + { + "epoch": 1.1317898486197684, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8275123834609985, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8677483797073364, + "num_tokens": 339302201.0, + "step": 8897 + }, + { + "epoch": 1.131917058898359, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.960939645767212, + "learning_rate": 1e-06, + "loss": 0.5483, + "mean_token_accuracy": 0.834525465965271, + "num_tokens": 339342091.0, + "step": 8898 + }, + { + "epoch": 1.1320442691769494, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 8.64664363861084, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8582189679145813, + "num_tokens": 339379951.0, + "step": 8899 + }, + { + "epoch": 1.13217147945554, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.94270920753479, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8572587966918945, + "num_tokens": 339422243.0, + "step": 8900 + }, + { + "epoch": 1.1322986897341305, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7284607887268066, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8780502080917358, + "num_tokens": 339464969.0, + "step": 8901 + }, + { + "epoch": 1.132425900012721, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8643356561660767, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8658492565155029, + "num_tokens": 339502382.0, + "step": 8902 + }, + { + "epoch": 1.1325531102913116, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.406524658203125, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8486965894699097, + "num_tokens": 339542875.0, + "step": 8903 + }, + { + "epoch": 1.132680320569902, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.2046868801116943, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8491426706314087, + "num_tokens": 339580776.0, + "step": 8904 + }, + { + "epoch": 1.1328075308484926, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.222632884979248, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.876183271408081, + "num_tokens": 339620195.0, + "step": 8905 + }, + { + "epoch": 1.1329347411270831, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7475768327713013, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8705013990402222, + "num_tokens": 339659234.0, + "step": 8906 + }, + { + "epoch": 1.1330619514056737, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8082876205444336, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8536940217018127, + "num_tokens": 339703259.0, + "step": 8907 + }, + { + "epoch": 1.133189161684264, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.3787918090820312, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.871526837348938, + "num_tokens": 339745901.0, + "step": 8908 + }, + { + "epoch": 1.1333163719628545, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.797949194908142, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8540112376213074, + "num_tokens": 339790417.0, + "step": 8909 + }, + { + "epoch": 1.133443582241445, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9013439416885376, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8568376302719116, + "num_tokens": 339827224.0, + "step": 8910 + }, + { + "epoch": 1.1335707925200356, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.0185182094573975, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.858735203742981, + "num_tokens": 339859932.0, + "step": 8911 + }, + { + "epoch": 1.133698002798626, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.989462971687317, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8513087630271912, + "num_tokens": 339902416.0, + "step": 8912 + }, + { + "epoch": 1.1338252130772166, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.875878930091858, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8708224892616272, + "num_tokens": 339940495.0, + "step": 8913 + }, + { + "epoch": 1.1339524233558071, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.7316312789916992, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8452622294425964, + "num_tokens": 339983010.0, + "step": 8914 + }, + { + "epoch": 1.1340796336343977, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9428869485855103, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8639010190963745, + "num_tokens": 340020683.0, + "step": 8915 + }, + { + "epoch": 1.1342068439129882, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.997012734413147, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8545675873756409, + "num_tokens": 340063179.0, + "step": 8916 + }, + { + "epoch": 1.1343340541915787, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.980238437652588, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8725489377975464, + "num_tokens": 340100166.0, + "step": 8917 + }, + { + "epoch": 1.1344612644701693, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.8156697750091553, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8588734269142151, + "num_tokens": 340144325.0, + "step": 8918 + }, + { + "epoch": 1.1345884747487598, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.358991861343384, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8613731265068054, + "num_tokens": 340180563.0, + "step": 8919 + }, + { + "epoch": 1.1347156850273503, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.8844969272613525, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.850479245185852, + "num_tokens": 340224621.0, + "step": 8920 + }, + { + "epoch": 1.1348428953059406, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.8786587715148926, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8688555955886841, + "num_tokens": 340264810.0, + "step": 8921 + }, + { + "epoch": 1.1349701055845312, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9081436395645142, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8746657371520996, + "num_tokens": 340301158.0, + "step": 8922 + }, + { + "epoch": 1.1350973158631217, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.921579122543335, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8628436326980591, + "num_tokens": 340339806.0, + "step": 8923 + }, + { + "epoch": 1.1352245261417122, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.6979511976242065, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8678891658782959, + "num_tokens": 340386412.0, + "step": 8924 + }, + { + "epoch": 1.1353517364203027, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.762277603149414, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8725453019142151, + "num_tokens": 340431391.0, + "step": 8925 + }, + { + "epoch": 1.1354789466988933, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.063939332962036, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8664631843566895, + "num_tokens": 340465904.0, + "step": 8926 + }, + { + "epoch": 1.1356061569774838, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.967365026473999, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.868064284324646, + "num_tokens": 340504290.0, + "step": 8927 + }, + { + "epoch": 1.1357333672560743, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9849683046340942, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8568212985992432, + "num_tokens": 340539100.0, + "step": 8928 + }, + { + "epoch": 1.1358605775346649, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9506019353866577, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8779863119125366, + "num_tokens": 340569005.0, + "step": 8929 + }, + { + "epoch": 1.1359877878132554, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.8591766357421875, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8624821901321411, + "num_tokens": 340613476.0, + "step": 8930 + }, + { + "epoch": 1.136114998091846, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9196399450302124, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8567619323730469, + "num_tokens": 340647982.0, + "step": 8931 + }, + { + "epoch": 1.1362422083704362, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.3299570083618164, + "learning_rate": 1e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8408744931221008, + "num_tokens": 340683270.0, + "step": 8932 + }, + { + "epoch": 1.1363694186490267, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1227035522460938, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8663839101791382, + "num_tokens": 340719955.0, + "step": 8933 + }, + { + "epoch": 1.1364966289276173, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.023853302001953, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8635397553443909, + "num_tokens": 340753934.0, + "step": 8934 + }, + { + "epoch": 1.1366238392062078, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0610837936401367, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8696556091308594, + "num_tokens": 340787545.0, + "step": 8935 + }, + { + "epoch": 1.1367510494847983, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9197642803192139, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8727286458015442, + "num_tokens": 340826362.0, + "step": 8936 + }, + { + "epoch": 1.1368782597633889, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.022707223892212, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.867343008518219, + "num_tokens": 340857871.0, + "step": 8937 + }, + { + "epoch": 1.1370054700419794, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.854702353477478, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8687765598297119, + "num_tokens": 340897736.0, + "step": 8938 + }, + { + "epoch": 1.13713268032057, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9051848649978638, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8587273955345154, + "num_tokens": 340935774.0, + "step": 8939 + }, + { + "epoch": 1.1372598905991604, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.023473024368286, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8652709126472473, + "num_tokens": 340971497.0, + "step": 8940 + }, + { + "epoch": 1.137387100877751, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0129125118255615, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8631085753440857, + "num_tokens": 341016209.0, + "step": 8941 + }, + { + "epoch": 1.1375143111563415, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8949609994888306, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8580037355422974, + "num_tokens": 341052137.0, + "step": 8942 + }, + { + "epoch": 1.137641521434932, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9570636749267578, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8657974600791931, + "num_tokens": 341083417.0, + "step": 8943 + }, + { + "epoch": 1.1377687317135226, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.938357949256897, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8665720820426941, + "num_tokens": 341127798.0, + "step": 8944 + }, + { + "epoch": 1.137895941992113, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9616223573684692, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8500008583068848, + "num_tokens": 341173485.0, + "step": 8945 + }, + { + "epoch": 1.1380231522707034, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0344960689544678, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8520613312721252, + "num_tokens": 341212834.0, + "step": 8946 + }, + { + "epoch": 1.138150362549294, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.07155179977417, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.856377124786377, + "num_tokens": 341246497.0, + "step": 8947 + }, + { + "epoch": 1.1382775728278844, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9264246225357056, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.846782922744751, + "num_tokens": 341291498.0, + "step": 8948 + }, + { + "epoch": 1.138404783106475, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9867873191833496, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8560869693756104, + "num_tokens": 341328104.0, + "step": 8949 + }, + { + "epoch": 1.1385319933850655, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.381242036819458, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8748949766159058, + "num_tokens": 341359042.0, + "step": 8950 + }, + { + "epoch": 1.138659203663656, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.06044340133667, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8625797629356384, + "num_tokens": 341403529.0, + "step": 8951 + }, + { + "epoch": 1.1387864139422466, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.909868597984314, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8693426847457886, + "num_tokens": 341442509.0, + "step": 8952 + }, + { + "epoch": 1.138913624220837, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.00430965423584, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8614280223846436, + "num_tokens": 341483015.0, + "step": 8953 + }, + { + "epoch": 1.1390408344994276, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.026404619216919, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8623473644256592, + "num_tokens": 341519676.0, + "step": 8954 + }, + { + "epoch": 1.1391680447780181, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8014394044876099, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8628177642822266, + "num_tokens": 341563483.0, + "step": 8955 + }, + { + "epoch": 1.1392952550566087, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9507017135620117, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8540629148483276, + "num_tokens": 341598387.0, + "step": 8956 + }, + { + "epoch": 1.139422465335199, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8701037168502808, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8657035827636719, + "num_tokens": 341640253.0, + "step": 8957 + }, + { + "epoch": 1.1395496756137895, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7909345626831055, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8895949125289917, + "num_tokens": 341675788.0, + "step": 8958 + }, + { + "epoch": 1.13967688589238, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.817184567451477, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8633276224136353, + "num_tokens": 341716917.0, + "step": 8959 + }, + { + "epoch": 1.1398040961709706, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8075944185256958, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8625020980834961, + "num_tokens": 341758936.0, + "step": 8960 + }, + { + "epoch": 1.139931306449561, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.000622510910034, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8392103314399719, + "num_tokens": 341797612.0, + "step": 8961 + }, + { + "epoch": 1.1400585167281516, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8901512622833252, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8586527109146118, + "num_tokens": 341836061.0, + "step": 8962 + }, + { + "epoch": 1.1401857270067421, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.090837001800537, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8627845644950867, + "num_tokens": 341875465.0, + "step": 8963 + }, + { + "epoch": 1.1403129372853327, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8302545547485352, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8568772077560425, + "num_tokens": 341919795.0, + "step": 8964 + }, + { + "epoch": 1.1404401475639232, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9930354356765747, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8599193096160889, + "num_tokens": 341953291.0, + "step": 8965 + }, + { + "epoch": 1.1405673578425137, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.081010580062866, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8810311555862427, + "num_tokens": 341988364.0, + "step": 8966 + }, + { + "epoch": 1.1406945681211043, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.691138744354248, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8663203120231628, + "num_tokens": 342028478.0, + "step": 8967 + }, + { + "epoch": 1.1408217783996948, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8476238250732422, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8704401254653931, + "num_tokens": 342067348.0, + "step": 8968 + }, + { + "epoch": 1.1409489886782853, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8809430599212646, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8587918877601624, + "num_tokens": 342105845.0, + "step": 8969 + }, + { + "epoch": 1.1410761989568756, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9571170806884766, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8628399968147278, + "num_tokens": 342142072.0, + "step": 8970 + }, + { + "epoch": 1.1412034092354661, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8971928358078003, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8653097152709961, + "num_tokens": 342176592.0, + "step": 8971 + }, + { + "epoch": 1.1413306195140567, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8587250709533691, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8676913380622864, + "num_tokens": 342211896.0, + "step": 8972 + }, + { + "epoch": 1.1414578297926472, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8320372104644775, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8669254779815674, + "num_tokens": 342256078.0, + "step": 8973 + }, + { + "epoch": 1.1415850400712377, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8775016069412231, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8662967681884766, + "num_tokens": 342296766.0, + "step": 8974 + }, + { + "epoch": 1.1417122503498283, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8742573261260986, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8613342642784119, + "num_tokens": 342333689.0, + "step": 8975 + }, + { + "epoch": 1.1418394606284188, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0431625843048096, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8731122612953186, + "num_tokens": 342369605.0, + "step": 8976 + }, + { + "epoch": 1.1419666709070093, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.6809520721435547, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8719406127929688, + "num_tokens": 342410806.0, + "step": 8977 + }, + { + "epoch": 1.1420938811855998, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9046307802200317, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8892521262168884, + "num_tokens": 342446957.0, + "step": 8978 + }, + { + "epoch": 1.1422210914641904, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.071601629257202, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8498687148094177, + "num_tokens": 342480717.0, + "step": 8979 + }, + { + "epoch": 1.142348301742781, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7935867309570312, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8716741800308228, + "num_tokens": 342520431.0, + "step": 8980 + }, + { + "epoch": 1.1424755120213712, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.7562578916549683, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8601764440536499, + "num_tokens": 342561735.0, + "step": 8981 + }, + { + "epoch": 1.1426027222999617, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9415189027786255, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.875795841217041, + "num_tokens": 342593899.0, + "step": 8982 + }, + { + "epoch": 1.1427299325785523, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.0560622215270996, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.862730860710144, + "num_tokens": 342628044.0, + "step": 8983 + }, + { + "epoch": 1.1428571428571428, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.96932053565979, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8599556088447571, + "num_tokens": 342665018.0, + "step": 8984 + }, + { + "epoch": 1.1429843531357333, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0488638877868652, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8516637682914734, + "num_tokens": 342704832.0, + "step": 8985 + }, + { + "epoch": 1.1431115634143239, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0715458393096924, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.871483564376831, + "num_tokens": 342741598.0, + "step": 8986 + }, + { + "epoch": 1.1432387736929144, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.156161308288574, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.858694851398468, + "num_tokens": 342777302.0, + "step": 8987 + }, + { + "epoch": 1.143365983971505, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0763609409332275, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8445228338241577, + "num_tokens": 342809578.0, + "step": 8988 + }, + { + "epoch": 1.1434931942500954, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8432931900024414, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8633549809455872, + "num_tokens": 342845809.0, + "step": 8989 + }, + { + "epoch": 1.143620404528686, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8616530895233154, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8504496812820435, + "num_tokens": 342886118.0, + "step": 8990 + }, + { + "epoch": 1.1437476148072765, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.900773286819458, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8584086894989014, + "num_tokens": 342922517.0, + "step": 8991 + }, + { + "epoch": 1.143874825085867, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0241212844848633, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8564356565475464, + "num_tokens": 342959741.0, + "step": 8992 + }, + { + "epoch": 1.1440020353644575, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9144093990325928, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.875654935836792, + "num_tokens": 342991418.0, + "step": 8993 + }, + { + "epoch": 1.144129245643048, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9648305177688599, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8808621764183044, + "num_tokens": 343024263.0, + "step": 8994 + }, + { + "epoch": 1.1442564559216384, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9229203462600708, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8632954955101013, + "num_tokens": 343066388.0, + "step": 8995 + }, + { + "epoch": 1.144383666200229, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.875340223312378, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8649376034736633, + "num_tokens": 343107631.0, + "step": 8996 + }, + { + "epoch": 1.1445108764788194, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8759095668792725, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8700487613677979, + "num_tokens": 343143645.0, + "step": 8997 + }, + { + "epoch": 1.14463808675741, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.144752264022827, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.850021481513977, + "num_tokens": 343182076.0, + "step": 8998 + }, + { + "epoch": 1.1447652970360005, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.883150339126587, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8810024261474609, + "num_tokens": 343222808.0, + "step": 8999 + }, + { + "epoch": 1.144892507314591, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.899187445640564, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8589489459991455, + "num_tokens": 343268229.0, + "step": 9000 + }, + { + "epoch": 1.1450197175931816, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.937171220779419, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8548387885093689, + "num_tokens": 343304459.0, + "step": 9001 + }, + { + "epoch": 1.145146927871772, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0290701389312744, + "learning_rate": 1e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.8245710134506226, + "num_tokens": 343343211.0, + "step": 9002 + }, + { + "epoch": 1.1452741381503626, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.758957028388977, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8594552874565125, + "num_tokens": 343388498.0, + "step": 9003 + }, + { + "epoch": 1.1454013484289531, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8681111335754395, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8602871894836426, + "num_tokens": 343430101.0, + "step": 9004 + }, + { + "epoch": 1.1455285587075437, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.937612771987915, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8511323928833008, + "num_tokens": 343466454.0, + "step": 9005 + }, + { + "epoch": 1.145655768986134, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8149046897888184, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8780621290206909, + "num_tokens": 343506960.0, + "step": 9006 + }, + { + "epoch": 1.1457829792647245, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8853435516357422, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8712770938873291, + "num_tokens": 343541133.0, + "step": 9007 + }, + { + "epoch": 1.145910189543315, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0453429222106934, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8717197179794312, + "num_tokens": 343580266.0, + "step": 9008 + }, + { + "epoch": 1.1460373998219056, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.023965358734131, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8607276678085327, + "num_tokens": 343611691.0, + "step": 9009 + }, + { + "epoch": 1.146164610100496, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.91658616065979, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8685945272445679, + "num_tokens": 343649355.0, + "step": 9010 + }, + { + "epoch": 1.1462918203790866, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8387144804000854, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8761513829231262, + "num_tokens": 343681606.0, + "step": 9011 + }, + { + "epoch": 1.1464190306576771, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.3983614444732666, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8735005855560303, + "num_tokens": 343721592.0, + "step": 9012 + }, + { + "epoch": 1.1465462409362677, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9098166227340698, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8597418665885925, + "num_tokens": 343764084.0, + "step": 9013 + }, + { + "epoch": 1.1466734512148582, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0125648975372314, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8644214868545532, + "num_tokens": 343801213.0, + "step": 9014 + }, + { + "epoch": 1.1468006614934487, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9755914211273193, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.863474428653717, + "num_tokens": 343839761.0, + "step": 9015 + }, + { + "epoch": 1.1469278717720393, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9620769023895264, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8634147644042969, + "num_tokens": 343877792.0, + "step": 9016 + }, + { + "epoch": 1.1470550820506298, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8036588430404663, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8694138526916504, + "num_tokens": 343915107.0, + "step": 9017 + }, + { + "epoch": 1.1471822923292203, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.104759693145752, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.867369532585144, + "num_tokens": 343949783.0, + "step": 9018 + }, + { + "epoch": 1.1473095026078106, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9233407974243164, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8452147245407104, + "num_tokens": 343991058.0, + "step": 9019 + }, + { + "epoch": 1.1474367128864011, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7886593341827393, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8646202683448792, + "num_tokens": 344030424.0, + "step": 9020 + }, + { + "epoch": 1.1475639231649917, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7510279417037964, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8640252351760864, + "num_tokens": 344071385.0, + "step": 9021 + }, + { + "epoch": 1.1476911334435822, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.0342466831207275, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8722923398017883, + "num_tokens": 344109725.0, + "step": 9022 + }, + { + "epoch": 1.1478183437221727, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8795756101608276, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8712395429611206, + "num_tokens": 344143947.0, + "step": 9023 + }, + { + "epoch": 1.1479455540007633, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.125133991241455, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8528503179550171, + "num_tokens": 344179969.0, + "step": 9024 + }, + { + "epoch": 1.1480727642793538, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 2.014312982559204, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8751829862594604, + "num_tokens": 344212279.0, + "step": 9025 + }, + { + "epoch": 1.1481999745579443, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 2.2421112060546875, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8698872327804565, + "num_tokens": 344248402.0, + "step": 9026 + }, + { + "epoch": 1.1483271848365348, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 2.0659518241882324, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8605858087539673, + "num_tokens": 344286557.0, + "step": 9027 + }, + { + "epoch": 1.1484543951151254, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.7879173755645752, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8636295795440674, + "num_tokens": 344331014.0, + "step": 9028 + }, + { + "epoch": 1.148581605393716, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.9764673709869385, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.859615683555603, + "num_tokens": 344373239.0, + "step": 9029 + }, + { + "epoch": 1.1487088156723062, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.7444747686386108, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8783942461013794, + "num_tokens": 344409871.0, + "step": 9030 + }, + { + "epoch": 1.1488360259508967, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.956430196762085, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8541803956031799, + "num_tokens": 344452108.0, + "step": 9031 + }, + { + "epoch": 1.1489632362294873, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.178581714630127, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8578540086746216, + "num_tokens": 344489798.0, + "step": 9032 + }, + { + "epoch": 1.1490904465080778, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.9933050870895386, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8716686367988586, + "num_tokens": 344528194.0, + "step": 9033 + }, + { + "epoch": 1.1492176567866683, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1322617530822754, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8482279181480408, + "num_tokens": 344559699.0, + "step": 9034 + }, + { + "epoch": 1.1493448670652588, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8781468868255615, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8561629056930542, + "num_tokens": 344598234.0, + "step": 9035 + }, + { + "epoch": 1.1494720773438494, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8573414087295532, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8556432127952576, + "num_tokens": 344634780.0, + "step": 9036 + }, + { + "epoch": 1.14959928762244, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9573445320129395, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8675908446311951, + "num_tokens": 344677167.0, + "step": 9037 + }, + { + "epoch": 1.1497264979010304, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1774134635925293, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8502458333969116, + "num_tokens": 344713824.0, + "step": 9038 + }, + { + "epoch": 1.149853708179621, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.957182765007019, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8508284687995911, + "num_tokens": 344755814.0, + "step": 9039 + }, + { + "epoch": 1.1499809184582115, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.750006914138794, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8788785338401794, + "num_tokens": 344795022.0, + "step": 9040 + }, + { + "epoch": 1.150108128736802, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8775800466537476, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.871858537197113, + "num_tokens": 344833906.0, + "step": 9041 + }, + { + "epoch": 1.1502353390153925, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9092282056808472, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8670426607131958, + "num_tokens": 344875975.0, + "step": 9042 + }, + { + "epoch": 1.150362549293983, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.80905282497406, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8669649362564087, + "num_tokens": 344915433.0, + "step": 9043 + }, + { + "epoch": 1.1504897595725734, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9455400705337524, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8677315711975098, + "num_tokens": 344949038.0, + "step": 9044 + }, + { + "epoch": 1.150616969851164, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.8477882146835327, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8649922609329224, + "num_tokens": 344988260.0, + "step": 9045 + }, + { + "epoch": 1.1507441801297544, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.897842526435852, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8762171268463135, + "num_tokens": 345024355.0, + "step": 9046 + }, + { + "epoch": 1.150871390408345, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.8217226266860962, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8723951578140259, + "num_tokens": 345063041.0, + "step": 9047 + }, + { + "epoch": 1.1509986006869355, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.835688591003418, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8824522495269775, + "num_tokens": 345097813.0, + "step": 9048 + }, + { + "epoch": 1.151125810965526, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.0638532638549805, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8586560487747192, + "num_tokens": 345134534.0, + "step": 9049 + }, + { + "epoch": 1.1512530212441165, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.948250412940979, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.86656254529953, + "num_tokens": 345171018.0, + "step": 9050 + }, + { + "epoch": 1.151380231522707, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 16.607250213623047, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8747919797897339, + "num_tokens": 345208006.0, + "step": 9051 + }, + { + "epoch": 1.1515074418012976, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.009439468383789, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8812692165374756, + "num_tokens": 345246616.0, + "step": 9052 + }, + { + "epoch": 1.1516346520798881, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.994888186454773, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8788014650344849, + "num_tokens": 345288095.0, + "step": 9053 + }, + { + "epoch": 1.1517618623584787, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7780768871307373, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.849376916885376, + "num_tokens": 345333291.0, + "step": 9054 + }, + { + "epoch": 1.151889072637069, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.7247546911239624, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8624229431152344, + "num_tokens": 345375093.0, + "step": 9055 + }, + { + "epoch": 1.1520162829156595, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.0373921394348145, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8591495752334595, + "num_tokens": 345415547.0, + "step": 9056 + }, + { + "epoch": 1.15214349319425, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.03393816947937, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8685083389282227, + "num_tokens": 345457340.0, + "step": 9057 + }, + { + "epoch": 1.1522707034728406, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9357060194015503, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8590057492256165, + "num_tokens": 345494711.0, + "step": 9058 + }, + { + "epoch": 1.152397913751431, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.915837287902832, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8555753231048584, + "num_tokens": 345534249.0, + "step": 9059 + }, + { + "epoch": 1.1525251240300216, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9374322891235352, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8627657294273376, + "num_tokens": 345570832.0, + "step": 9060 + }, + { + "epoch": 1.1526523343086121, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.006910800933838, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8584116697311401, + "num_tokens": 345609007.0, + "step": 9061 + }, + { + "epoch": 1.1527795445872027, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.380054473876953, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8750969171524048, + "num_tokens": 345647979.0, + "step": 9062 + }, + { + "epoch": 1.1529067548657932, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.072511672973633, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8558379411697388, + "num_tokens": 345681821.0, + "step": 9063 + }, + { + "epoch": 1.1530339651443837, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.8541730642318726, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8544174432754517, + "num_tokens": 345724274.0, + "step": 9064 + }, + { + "epoch": 1.1531611754229742, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.7235440015792847, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8804072141647339, + "num_tokens": 345763558.0, + "step": 9065 + }, + { + "epoch": 1.1532883857015648, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9884560108184814, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8582884073257446, + "num_tokens": 345798486.0, + "step": 9066 + }, + { + "epoch": 1.1534155959801553, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.8136968612670898, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8613542318344116, + "num_tokens": 345839379.0, + "step": 9067 + }, + { + "epoch": 1.1535428062587456, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 1.945263147354126, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8590309619903564, + "num_tokens": 345876675.0, + "step": 9068 + }, + { + "epoch": 1.1536700165373361, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.7927141189575195, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8819240927696228, + "num_tokens": 345913482.0, + "step": 9069 + }, + { + "epoch": 1.1537972268159267, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9600000381469727, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8591223955154419, + "num_tokens": 345947671.0, + "step": 9070 + }, + { + "epoch": 1.1539244370945172, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.880416750907898, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8661829233169556, + "num_tokens": 345981520.0, + "step": 9071 + }, + { + "epoch": 1.1540516473731077, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.2609965801239014, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8461035490036011, + "num_tokens": 346020050.0, + "step": 9072 + }, + { + "epoch": 1.1541788576516983, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1917402744293213, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8538311719894409, + "num_tokens": 346050000.0, + "step": 9073 + }, + { + "epoch": 1.1543060679302888, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0427749156951904, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8587020039558411, + "num_tokens": 346085038.0, + "step": 9074 + }, + { + "epoch": 1.1544332782088793, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.755313754081726, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8742125034332275, + "num_tokens": 346125871.0, + "step": 9075 + }, + { + "epoch": 1.1545604884874698, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8657996654510498, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8720155954360962, + "num_tokens": 346160460.0, + "step": 9076 + }, + { + "epoch": 1.1546876987660604, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.967942476272583, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.853787362575531, + "num_tokens": 346201621.0, + "step": 9077 + }, + { + "epoch": 1.154814909044651, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9599545001983643, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8659077882766724, + "num_tokens": 346242680.0, + "step": 9078 + }, + { + "epoch": 1.1549421193232412, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.323826789855957, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8541566133499146, + "num_tokens": 346275774.0, + "step": 9079 + }, + { + "epoch": 1.1550693296018317, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7796486616134644, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8657265305519104, + "num_tokens": 346317648.0, + "step": 9080 + }, + { + "epoch": 1.1551965398804223, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8744451999664307, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8664054870605469, + "num_tokens": 346357736.0, + "step": 9081 + }, + { + "epoch": 1.1553237501590128, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7122730016708374, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8618525266647339, + "num_tokens": 346397961.0, + "step": 9082 + }, + { + "epoch": 1.1554509604376033, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.054468870162964, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8753670454025269, + "num_tokens": 346438268.0, + "step": 9083 + }, + { + "epoch": 1.1555781707161938, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.2918083667755127, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8541726469993591, + "num_tokens": 346479024.0, + "step": 9084 + }, + { + "epoch": 1.1557053809947844, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8007768392562866, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8695018887519836, + "num_tokens": 346523526.0, + "step": 9085 + }, + { + "epoch": 1.155832591273375, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8879555463790894, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8715192079544067, + "num_tokens": 346561985.0, + "step": 9086 + }, + { + "epoch": 1.1559598015519654, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.816020131111145, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8503427505493164, + "num_tokens": 346605600.0, + "step": 9087 + }, + { + "epoch": 1.156087011830556, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.889593243598938, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8771609663963318, + "num_tokens": 346640798.0, + "step": 9088 + }, + { + "epoch": 1.1562142221091465, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.809902310371399, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8593569993972778, + "num_tokens": 346678483.0, + "step": 9089 + }, + { + "epoch": 1.156341432387737, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.7985895872116089, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8806215524673462, + "num_tokens": 346714604.0, + "step": 9090 + }, + { + "epoch": 1.1564686426663275, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.956964373588562, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8747403621673584, + "num_tokens": 346747203.0, + "step": 9091 + }, + { + "epoch": 1.156595852944918, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.808782935142517, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8627086877822876, + "num_tokens": 346790650.0, + "step": 9092 + }, + { + "epoch": 1.1567230632235084, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9967412948608398, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8853799104690552, + "num_tokens": 346822872.0, + "step": 9093 + }, + { + "epoch": 1.156850273502099, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.9606833457946777, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8701437711715698, + "num_tokens": 346857999.0, + "step": 9094 + }, + { + "epoch": 1.1569774837806894, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.0473859310150146, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8716586232185364, + "num_tokens": 346891804.0, + "step": 9095 + }, + { + "epoch": 1.15710469405928, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9666249752044678, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8622286319732666, + "num_tokens": 346931554.0, + "step": 9096 + }, + { + "epoch": 1.1572319043378705, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.061655282974243, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8512330651283264, + "num_tokens": 346965214.0, + "step": 9097 + }, + { + "epoch": 1.157359114616461, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.999481439590454, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8749892115592957, + "num_tokens": 347001219.0, + "step": 9098 + }, + { + "epoch": 1.1574863248950515, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9176405668258667, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8462767004966736, + "num_tokens": 347043212.0, + "step": 9099 + }, + { + "epoch": 1.157613535173642, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.00768780708313, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8765227198600769, + "num_tokens": 347082249.0, + "step": 9100 + }, + { + "epoch": 1.1577407454522326, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9461283683776855, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8610955476760864, + "num_tokens": 347119286.0, + "step": 9101 + }, + { + "epoch": 1.1578679557308231, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8328458070755005, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8769251108169556, + "num_tokens": 347155549.0, + "step": 9102 + }, + { + "epoch": 1.1579951660094137, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1464056968688965, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.866605281829834, + "num_tokens": 347196846.0, + "step": 9103 + }, + { + "epoch": 1.158122376288004, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7140610218048096, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8781262636184692, + "num_tokens": 347234028.0, + "step": 9104 + }, + { + "epoch": 1.1582495865665945, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9110685586929321, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8753612041473389, + "num_tokens": 347276931.0, + "step": 9105 + }, + { + "epoch": 1.158376796845185, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9299209117889404, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8603726625442505, + "num_tokens": 347317139.0, + "step": 9106 + }, + { + "epoch": 1.1585040071237755, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9504330158233643, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.866127073764801, + "num_tokens": 347355718.0, + "step": 9107 + }, + { + "epoch": 1.158631217402366, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9662306308746338, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8537492752075195, + "num_tokens": 347393403.0, + "step": 9108 + }, + { + "epoch": 1.1587584276809566, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1258914470672607, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8703861832618713, + "num_tokens": 347433736.0, + "step": 9109 + }, + { + "epoch": 1.1588856379595471, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8328359127044678, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8662329912185669, + "num_tokens": 347474574.0, + "step": 9110 + }, + { + "epoch": 1.1590128482381377, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7101162672042847, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8705506324768066, + "num_tokens": 347518785.0, + "step": 9111 + }, + { + "epoch": 1.1591400585167282, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0058412551879883, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8525956273078918, + "num_tokens": 347557518.0, + "step": 9112 + }, + { + "epoch": 1.1592672687953187, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.959802508354187, + "learning_rate": 1e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.8346157670021057, + "num_tokens": 347600094.0, + "step": 9113 + }, + { + "epoch": 1.1593944790739092, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.0157604217529297, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.863628625869751, + "num_tokens": 347633709.0, + "step": 9114 + }, + { + "epoch": 1.1595216893524998, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8112481832504272, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8712162375450134, + "num_tokens": 347670027.0, + "step": 9115 + }, + { + "epoch": 1.1596488996310903, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8304651975631714, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8693374395370483, + "num_tokens": 347707488.0, + "step": 9116 + }, + { + "epoch": 1.1597761099096806, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8001058101654053, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8599697947502136, + "num_tokens": 347746030.0, + "step": 9117 + }, + { + "epoch": 1.1599033201882711, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.868070363998413, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8576698899269104, + "num_tokens": 347783409.0, + "step": 9118 + }, + { + "epoch": 1.1600305304668617, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.0275912284851074, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.872202455997467, + "num_tokens": 347821335.0, + "step": 9119 + }, + { + "epoch": 1.1601577407454522, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.0702900886535645, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8753414750099182, + "num_tokens": 347857590.0, + "step": 9120 + }, + { + "epoch": 1.1602849510240427, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9412633180618286, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8766903281211853, + "num_tokens": 347897000.0, + "step": 9121 + }, + { + "epoch": 1.1604121613026332, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.9624106884002686, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8649097681045532, + "num_tokens": 347928947.0, + "step": 9122 + }, + { + "epoch": 1.1605393715812238, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.108858823776245, + "learning_rate": 1e-06, + "loss": 0.5502, + "mean_token_accuracy": 0.8343375325202942, + "num_tokens": 347959294.0, + "step": 9123 + }, + { + "epoch": 1.1606665818598143, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9491685628890991, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8710432052612305, + "num_tokens": 347993103.0, + "step": 9124 + }, + { + "epoch": 1.1607937921384048, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.0165586471557617, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.876477062702179, + "num_tokens": 348028412.0, + "step": 9125 + }, + { + "epoch": 1.1609210024169954, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9230843782424927, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8468883037567139, + "num_tokens": 348071429.0, + "step": 9126 + }, + { + "epoch": 1.161048212695586, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9697495698928833, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8688290119171143, + "num_tokens": 348113222.0, + "step": 9127 + }, + { + "epoch": 1.1611754229741762, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.0089192390441895, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8637369871139526, + "num_tokens": 348150188.0, + "step": 9128 + }, + { + "epoch": 1.1613026332527667, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.7650431394577026, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8678748607635498, + "num_tokens": 348188547.0, + "step": 9129 + }, + { + "epoch": 1.1614298435313573, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.95797598361969, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8596019744873047, + "num_tokens": 348228624.0, + "step": 9130 + }, + { + "epoch": 1.1615570538099478, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.0196890830993652, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8619969487190247, + "num_tokens": 348264932.0, + "step": 9131 + }, + { + "epoch": 1.1616842640885383, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.816909909248352, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8664076328277588, + "num_tokens": 348307680.0, + "step": 9132 + }, + { + "epoch": 1.1618114743671288, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.9916527271270752, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8785121440887451, + "num_tokens": 348346449.0, + "step": 9133 + }, + { + "epoch": 1.1619386846457194, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.7537407875061035, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8743573427200317, + "num_tokens": 348386393.0, + "step": 9134 + }, + { + "epoch": 1.16206589492431, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.8410797119140625, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8578574657440186, + "num_tokens": 348429458.0, + "step": 9135 + }, + { + "epoch": 1.1621931052029004, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 2.019925594329834, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8516203165054321, + "num_tokens": 348471365.0, + "step": 9136 + }, + { + "epoch": 1.162320315481491, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.9948418140411377, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8482626676559448, + "num_tokens": 348513425.0, + "step": 9137 + }, + { + "epoch": 1.1624475257600815, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 2.0187594890594482, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8618170022964478, + "num_tokens": 348551181.0, + "step": 9138 + }, + { + "epoch": 1.162574736038672, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9108625650405884, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8855767250061035, + "num_tokens": 348583611.0, + "step": 9139 + }, + { + "epoch": 1.1627019463172625, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.8381168842315674, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.862956166267395, + "num_tokens": 348624281.0, + "step": 9140 + }, + { + "epoch": 1.162829156595853, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9422482252120972, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8634153604507446, + "num_tokens": 348661416.0, + "step": 9141 + }, + { + "epoch": 1.1629563668744434, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.9732706546783447, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.843085527420044, + "num_tokens": 348703907.0, + "step": 9142 + }, + { + "epoch": 1.163083577153034, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.9348187446594238, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8759869337081909, + "num_tokens": 348735617.0, + "step": 9143 + }, + { + "epoch": 1.1632107874316244, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8601423501968384, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8619484901428223, + "num_tokens": 348773552.0, + "step": 9144 + }, + { + "epoch": 1.163337997710215, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.919293761253357, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8657264113426208, + "num_tokens": 348811274.0, + "step": 9145 + }, + { + "epoch": 1.1634652079888055, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9248664379119873, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8530722856521606, + "num_tokens": 348847773.0, + "step": 9146 + }, + { + "epoch": 1.163592418267396, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9331704378128052, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8561422824859619, + "num_tokens": 348886356.0, + "step": 9147 + }, + { + "epoch": 1.1637196285459865, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.91338312625885, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8760755062103271, + "num_tokens": 348922892.0, + "step": 9148 + }, + { + "epoch": 1.163846838824577, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9485126733779907, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8827760219573975, + "num_tokens": 348954924.0, + "step": 9149 + }, + { + "epoch": 1.1639740491031676, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8621363639831543, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8698848485946655, + "num_tokens": 348997335.0, + "step": 9150 + }, + { + "epoch": 1.1641012593817581, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.0007681846618652, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8530574440956116, + "num_tokens": 349038221.0, + "step": 9151 + }, + { + "epoch": 1.1642284696603487, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.987924337387085, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8591283559799194, + "num_tokens": 349076559.0, + "step": 9152 + }, + { + "epoch": 1.164355679938939, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.032229423522949, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8424060344696045, + "num_tokens": 349112264.0, + "step": 9153 + }, + { + "epoch": 1.1644828902175295, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9067023992538452, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8762676119804382, + "num_tokens": 349146571.0, + "step": 9154 + }, + { + "epoch": 1.16461010049612, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.0883758068084717, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8707439303398132, + "num_tokens": 349183674.0, + "step": 9155 + }, + { + "epoch": 1.1647373107747105, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9759058952331543, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8693552017211914, + "num_tokens": 349227042.0, + "step": 9156 + }, + { + "epoch": 1.164864521053301, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9395321607589722, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8592807054519653, + "num_tokens": 349264072.0, + "step": 9157 + }, + { + "epoch": 1.1649917313318916, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.891741394996643, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8656631112098694, + "num_tokens": 349304427.0, + "step": 9158 + }, + { + "epoch": 1.1651189416104821, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8121477365493774, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8625648021697998, + "num_tokens": 349346708.0, + "step": 9159 + }, + { + "epoch": 1.1652461518890727, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.7929397821426392, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8631833791732788, + "num_tokens": 349387716.0, + "step": 9160 + }, + { + "epoch": 1.1653733621676632, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8132935762405396, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8608869910240173, + "num_tokens": 349425757.0, + "step": 9161 + }, + { + "epoch": 1.1655005724462537, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9522196054458618, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8711956739425659, + "num_tokens": 349472097.0, + "step": 9162 + }, + { + "epoch": 1.1656277827248442, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9100371599197388, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8615480065345764, + "num_tokens": 349513809.0, + "step": 9163 + }, + { + "epoch": 1.1657549930034348, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9359666109085083, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8789000511169434, + "num_tokens": 349544906.0, + "step": 9164 + }, + { + "epoch": 1.1658822032820253, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.903982162475586, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8594988584518433, + "num_tokens": 349582529.0, + "step": 9165 + }, + { + "epoch": 1.1660094135606156, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8629698753356934, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8604267835617065, + "num_tokens": 349624871.0, + "step": 9166 + }, + { + "epoch": 1.1661366238392061, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9562371969223022, + "learning_rate": 1e-06, + "loss": 0.5381, + "mean_token_accuracy": 0.829441487789154, + "num_tokens": 349660783.0, + "step": 9167 + }, + { + "epoch": 1.1662638341177967, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9139503240585327, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8742648363113403, + "num_tokens": 349698430.0, + "step": 9168 + }, + { + "epoch": 1.1663910443963872, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9743880033493042, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8562726378440857, + "num_tokens": 349734981.0, + "step": 9169 + }, + { + "epoch": 1.1665182546749777, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9717986583709717, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8578689694404602, + "num_tokens": 349774267.0, + "step": 9170 + }, + { + "epoch": 1.1666454649535682, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8523848056793213, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.862522304058075, + "num_tokens": 349811400.0, + "step": 9171 + }, + { + "epoch": 1.1667726752321588, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.7759877443313599, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8558707237243652, + "num_tokens": 349850031.0, + "step": 9172 + }, + { + "epoch": 1.1668998855107493, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8587695360183716, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8666361570358276, + "num_tokens": 349889967.0, + "step": 9173 + }, + { + "epoch": 1.1670270957893398, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.1808390617370605, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8577811121940613, + "num_tokens": 349930276.0, + "step": 9174 + }, + { + "epoch": 1.1671543060679304, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8532946109771729, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8604388236999512, + "num_tokens": 349969889.0, + "step": 9175 + }, + { + "epoch": 1.1672815163465209, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9477773904800415, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8625597357749939, + "num_tokens": 350005729.0, + "step": 9176 + }, + { + "epoch": 1.1674087266251112, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.7607898712158203, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8712191581726074, + "num_tokens": 350043130.0, + "step": 9177 + }, + { + "epoch": 1.1675359369037017, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9338040351867676, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8824913501739502, + "num_tokens": 350081233.0, + "step": 9178 + }, + { + "epoch": 1.1676631471822922, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9738237857818604, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8500754833221436, + "num_tokens": 350119403.0, + "step": 9179 + }, + { + "epoch": 1.1677903574608828, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.083073616027832, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8527994751930237, + "num_tokens": 350159099.0, + "step": 9180 + }, + { + "epoch": 1.1679175677394733, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0168755054473877, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8637818098068237, + "num_tokens": 350195642.0, + "step": 9181 + }, + { + "epoch": 1.1680447780180638, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8031933307647705, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8712823390960693, + "num_tokens": 350236721.0, + "step": 9182 + }, + { + "epoch": 1.1681719882966544, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.7836326360702515, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8590190410614014, + "num_tokens": 350272691.0, + "step": 9183 + }, + { + "epoch": 1.168299198575245, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.736711025238037, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8624143004417419, + "num_tokens": 350318299.0, + "step": 9184 + }, + { + "epoch": 1.1684264088538354, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9388724565505981, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8634593486785889, + "num_tokens": 350360973.0, + "step": 9185 + }, + { + "epoch": 1.168553619132426, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8298346996307373, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8713566064834595, + "num_tokens": 350400516.0, + "step": 9186 + }, + { + "epoch": 1.1686808294110165, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8329418897628784, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8601984977722168, + "num_tokens": 350438147.0, + "step": 9187 + }, + { + "epoch": 1.168808039689607, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.7866703271865845, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8676908016204834, + "num_tokens": 350481604.0, + "step": 9188 + }, + { + "epoch": 1.1689352499681975, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.914867877960205, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8832483291625977, + "num_tokens": 350518225.0, + "step": 9189 + }, + { + "epoch": 1.169062460246788, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9953094720840454, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.863465428352356, + "num_tokens": 350553752.0, + "step": 9190 + }, + { + "epoch": 1.1691896705253784, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.916013479232788, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8690084218978882, + "num_tokens": 350593676.0, + "step": 9191 + }, + { + "epoch": 1.169316880803969, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.917127013206482, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.863621711730957, + "num_tokens": 350633618.0, + "step": 9192 + }, + { + "epoch": 1.1694440910825594, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.7743881940841675, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8629449605941772, + "num_tokens": 350678221.0, + "step": 9193 + }, + { + "epoch": 1.16957130136115, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.838631510734558, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8743546605110168, + "num_tokens": 350718258.0, + "step": 9194 + }, + { + "epoch": 1.1696985116397405, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8630857467651367, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8740096092224121, + "num_tokens": 350757370.0, + "step": 9195 + }, + { + "epoch": 1.169825721918331, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 6.3835554122924805, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8646153211593628, + "num_tokens": 350801394.0, + "step": 9196 + }, + { + "epoch": 1.1699529321969215, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.116676092147827, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8568711280822754, + "num_tokens": 350840635.0, + "step": 9197 + }, + { + "epoch": 1.170080142475512, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8608952760696411, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8796480894088745, + "num_tokens": 350879616.0, + "step": 9198 + }, + { + "epoch": 1.1702073527541026, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.91121244430542, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8645138740539551, + "num_tokens": 350921710.0, + "step": 9199 + }, + { + "epoch": 1.1703345630326931, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.7675480842590332, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8523848056793213, + "num_tokens": 350964218.0, + "step": 9200 + }, + { + "epoch": 1.1704617733112836, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.116753578186035, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8729240298271179, + "num_tokens": 351004511.0, + "step": 9201 + }, + { + "epoch": 1.170588983589874, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.1823673248291016, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8596978187561035, + "num_tokens": 351044583.0, + "step": 9202 + }, + { + "epoch": 1.1707161938684645, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8888850212097168, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8485468626022339, + "num_tokens": 351083613.0, + "step": 9203 + }, + { + "epoch": 1.170843404147055, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9256609678268433, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8574930429458618, + "num_tokens": 351117412.0, + "step": 9204 + }, + { + "epoch": 1.1709706144256455, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.867662787437439, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.848707377910614, + "num_tokens": 351154779.0, + "step": 9205 + }, + { + "epoch": 1.171097824704236, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.0053906440734863, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8746580481529236, + "num_tokens": 351191412.0, + "step": 9206 + }, + { + "epoch": 1.1712250349828266, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.7399920225143433, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8674392700195312, + "num_tokens": 351235465.0, + "step": 9207 + }, + { + "epoch": 1.1713522452614171, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7973448038101196, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8695934414863586, + "num_tokens": 351279326.0, + "step": 9208 + }, + { + "epoch": 1.1714794555400077, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8261418342590332, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8818042278289795, + "num_tokens": 351316678.0, + "step": 9209 + }, + { + "epoch": 1.1716066658185982, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 80.53202056884766, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.860080361366272, + "num_tokens": 351352641.0, + "step": 9210 + }, + { + "epoch": 1.1717338760971887, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.5663304328918457, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8555587530136108, + "num_tokens": 351385845.0, + "step": 9211 + }, + { + "epoch": 1.1718610863757792, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.933801531791687, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.87093186378479, + "num_tokens": 351425220.0, + "step": 9212 + }, + { + "epoch": 1.1719882966543698, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8966401815414429, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.858253538608551, + "num_tokens": 351467515.0, + "step": 9213 + }, + { + "epoch": 1.1721155069329603, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9406882524490356, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8622493147850037, + "num_tokens": 351507571.0, + "step": 9214 + }, + { + "epoch": 1.1722427172115506, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8490115404129028, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8616140484809875, + "num_tokens": 351546150.0, + "step": 9215 + }, + { + "epoch": 1.1723699274901411, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8494236469268799, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8571692705154419, + "num_tokens": 351588574.0, + "step": 9216 + }, + { + "epoch": 1.1724971377687317, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9544274806976318, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8649653792381287, + "num_tokens": 351626914.0, + "step": 9217 + }, + { + "epoch": 1.1726243480473222, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8477498292922974, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8541585206985474, + "num_tokens": 351663574.0, + "step": 9218 + }, + { + "epoch": 1.1727515583259127, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8767762184143066, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8468431234359741, + "num_tokens": 351703275.0, + "step": 9219 + }, + { + "epoch": 1.1728787686045032, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.874642014503479, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.854031503200531, + "num_tokens": 351745713.0, + "step": 9220 + }, + { + "epoch": 1.1730059788830938, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8877254724502563, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.850023627281189, + "num_tokens": 351780789.0, + "step": 9221 + }, + { + "epoch": 1.1731331891616843, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8703458309173584, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8741263151168823, + "num_tokens": 351820352.0, + "step": 9222 + }, + { + "epoch": 1.1732603994402748, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8279153108596802, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8716423511505127, + "num_tokens": 351860011.0, + "step": 9223 + }, + { + "epoch": 1.1733876097188654, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 3.3851001262664795, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8562475442886353, + "num_tokens": 351898742.0, + "step": 9224 + }, + { + "epoch": 1.1735148199974559, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9831454753875732, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8516733646392822, + "num_tokens": 351940201.0, + "step": 9225 + }, + { + "epoch": 1.1736420302760462, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.095130681991577, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.862250566482544, + "num_tokens": 351971041.0, + "step": 9226 + }, + { + "epoch": 1.1737692405546367, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.856224536895752, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8462653756141663, + "num_tokens": 352013002.0, + "step": 9227 + }, + { + "epoch": 1.1738964508332272, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8550500869750977, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8617388010025024, + "num_tokens": 352051435.0, + "step": 9228 + }, + { + "epoch": 1.1740236611118178, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8496705293655396, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8689563274383545, + "num_tokens": 352093295.0, + "step": 9229 + }, + { + "epoch": 1.1741508713904083, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.7344152927398682, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8585423231124878, + "num_tokens": 352136401.0, + "step": 9230 + }, + { + "epoch": 1.1742780816689988, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.7181823253631592, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8743208646774292, + "num_tokens": 352178696.0, + "step": 9231 + }, + { + "epoch": 1.1744052919475894, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.1494877338409424, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8445541262626648, + "num_tokens": 352214759.0, + "step": 9232 + }, + { + "epoch": 1.1745325022261799, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9862605333328247, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8767470717430115, + "num_tokens": 352250520.0, + "step": 9233 + }, + { + "epoch": 1.1746597125047704, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8483041524887085, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8609676361083984, + "num_tokens": 352288773.0, + "step": 9234 + }, + { + "epoch": 1.174786922783361, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.7425086498260498, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8712528944015503, + "num_tokens": 352326548.0, + "step": 9235 + }, + { + "epoch": 1.1749141330619515, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.84260892868042, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8732442855834961, + "num_tokens": 352366537.0, + "step": 9236 + }, + { + "epoch": 1.175041343340542, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.0704026222229004, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8467206954956055, + "num_tokens": 352402390.0, + "step": 9237 + }, + { + "epoch": 1.1751685536191325, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.126227855682373, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8604629039764404, + "num_tokens": 352442357.0, + "step": 9238 + }, + { + "epoch": 1.175295763897723, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9627958536148071, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8611667156219482, + "num_tokens": 352476884.0, + "step": 9239 + }, + { + "epoch": 1.1754229741763134, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.9058804512023926, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8642114400863647, + "num_tokens": 352513448.0, + "step": 9240 + }, + { + "epoch": 1.175550184454904, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8631727695465088, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8840537071228027, + "num_tokens": 352548420.0, + "step": 9241 + }, + { + "epoch": 1.1756773947334944, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8211915493011475, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8669326305389404, + "num_tokens": 352592157.0, + "step": 9242 + }, + { + "epoch": 1.175804605012085, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.036007881164551, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8559080958366394, + "num_tokens": 352626427.0, + "step": 9243 + }, + { + "epoch": 1.1759318152906755, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.7369294166564941, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8736238479614258, + "num_tokens": 352667075.0, + "step": 9244 + }, + { + "epoch": 1.176059025569266, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8621938228607178, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8612983822822571, + "num_tokens": 352707114.0, + "step": 9245 + }, + { + "epoch": 1.1761862358478565, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.0925092697143555, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8695851564407349, + "num_tokens": 352738064.0, + "step": 9246 + }, + { + "epoch": 1.176313446126447, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0053443908691406, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8488765954971313, + "num_tokens": 352779964.0, + "step": 9247 + }, + { + "epoch": 1.1764406564050376, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0025484561920166, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8719503879547119, + "num_tokens": 352819574.0, + "step": 9248 + }, + { + "epoch": 1.1765678666836281, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 3.9253647327423096, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8729349970817566, + "num_tokens": 352859151.0, + "step": 9249 + }, + { + "epoch": 1.1766950769622184, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.95838463306427, + "learning_rate": 1e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8421354293823242, + "num_tokens": 352901119.0, + "step": 9250 + }, + { + "epoch": 1.176822287240809, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8122323751449585, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8614951372146606, + "num_tokens": 352945878.0, + "step": 9251 + }, + { + "epoch": 1.1769494975193995, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.815014123916626, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8581050634384155, + "num_tokens": 352979819.0, + "step": 9252 + }, + { + "epoch": 1.17707670779799, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.7583839893341064, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8528674840927124, + "num_tokens": 353023552.0, + "step": 9253 + }, + { + "epoch": 1.1772039180765805, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.850111484527588, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8643885850906372, + "num_tokens": 353058178.0, + "step": 9254 + }, + { + "epoch": 1.177331128355171, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.044734001159668, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8728423714637756, + "num_tokens": 353091428.0, + "step": 9255 + }, + { + "epoch": 1.1774583386337616, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.7091299295425415, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8678378462791443, + "num_tokens": 353131943.0, + "step": 9256 + }, + { + "epoch": 1.1775855489123521, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9107553958892822, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8555363416671753, + "num_tokens": 353171213.0, + "step": 9257 + }, + { + "epoch": 1.1777127591909426, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.991682529449463, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8721300959587097, + "num_tokens": 353205057.0, + "step": 9258 + }, + { + "epoch": 1.1778399694695332, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8027220964431763, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8495593070983887, + "num_tokens": 353251164.0, + "step": 9259 + }, + { + "epoch": 1.1779671797481237, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8889877796173096, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8686773777008057, + "num_tokens": 353288202.0, + "step": 9260 + }, + { + "epoch": 1.1780943900267142, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9032807350158691, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8622623682022095, + "num_tokens": 353327396.0, + "step": 9261 + }, + { + "epoch": 1.1782216003053048, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0258429050445557, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8522007465362549, + "num_tokens": 353366512.0, + "step": 9262 + }, + { + "epoch": 1.1783488105838953, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.995895504951477, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8552672266960144, + "num_tokens": 353408546.0, + "step": 9263 + }, + { + "epoch": 1.1784760208624856, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8919954299926758, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8723680973052979, + "num_tokens": 353444950.0, + "step": 9264 + }, + { + "epoch": 1.1786032311410761, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.757554292678833, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8711993098258972, + "num_tokens": 353483306.0, + "step": 9265 + }, + { + "epoch": 1.1787304414196667, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8532581329345703, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8632935881614685, + "num_tokens": 353523038.0, + "step": 9266 + }, + { + "epoch": 1.1788576516982572, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9276973009109497, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8738946318626404, + "num_tokens": 353564266.0, + "step": 9267 + }, + { + "epoch": 1.1789848619768477, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8343665599822998, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8660140037536621, + "num_tokens": 353603558.0, + "step": 9268 + }, + { + "epoch": 1.1791120722554382, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.7588001489639282, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8709713220596313, + "num_tokens": 353648097.0, + "step": 9269 + }, + { + "epoch": 1.1792392825340288, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8022915124893188, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8488959074020386, + "num_tokens": 353688970.0, + "step": 9270 + }, + { + "epoch": 1.1793664928126193, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.845697283744812, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8529372215270996, + "num_tokens": 353732505.0, + "step": 9271 + }, + { + "epoch": 1.1794937030912098, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8392248153686523, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8715887069702148, + "num_tokens": 353767460.0, + "step": 9272 + }, + { + "epoch": 1.1796209133698004, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.1098713874816895, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8724974989891052, + "num_tokens": 353801951.0, + "step": 9273 + }, + { + "epoch": 1.1797481236483909, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 16.62325668334961, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8704804182052612, + "num_tokens": 353836815.0, + "step": 9274 + }, + { + "epoch": 1.1798753339269812, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.177943468093872, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8566461801528931, + "num_tokens": 353876296.0, + "step": 9275 + }, + { + "epoch": 1.1800025442055717, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.3800885677337646, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8461951613426208, + "num_tokens": 353911792.0, + "step": 9276 + }, + { + "epoch": 1.1801297544841622, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0211105346679688, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8531016707420349, + "num_tokens": 353949532.0, + "step": 9277 + }, + { + "epoch": 1.1802569647627528, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.1209757328033447, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8501654863357544, + "num_tokens": 353988622.0, + "step": 9278 + }, + { + "epoch": 1.1803841750413433, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.7633695602416992, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8666672706604004, + "num_tokens": 354028136.0, + "step": 9279 + }, + { + "epoch": 1.1805113853199338, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.045440435409546, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8637941479682922, + "num_tokens": 354065029.0, + "step": 9280 + }, + { + "epoch": 1.1806385955985244, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0199131965637207, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8745561242103577, + "num_tokens": 354099279.0, + "step": 9281 + }, + { + "epoch": 1.1807658058771149, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8335318565368652, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8581070303916931, + "num_tokens": 354138870.0, + "step": 9282 + }, + { + "epoch": 1.1808930161557054, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.9187904596328735, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8510505557060242, + "num_tokens": 354176498.0, + "step": 9283 + }, + { + "epoch": 1.181020226434296, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.9176796674728394, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8446168899536133, + "num_tokens": 354210063.0, + "step": 9284 + }, + { + "epoch": 1.1811474367128865, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9167131185531616, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8566656112670898, + "num_tokens": 354248095.0, + "step": 9285 + }, + { + "epoch": 1.181274646991477, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.096038818359375, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8590128421783447, + "num_tokens": 354286897.0, + "step": 9286 + }, + { + "epoch": 1.1814018572700675, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.300916910171509, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8717630505561829, + "num_tokens": 354330685.0, + "step": 9287 + }, + { + "epoch": 1.181529067548658, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.875091791152954, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8547160029411316, + "num_tokens": 354371120.0, + "step": 9288 + }, + { + "epoch": 1.1816562778272484, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9073028564453125, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8606822490692139, + "num_tokens": 354410825.0, + "step": 9289 + }, + { + "epoch": 1.1817834881058389, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.777911901473999, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8600008487701416, + "num_tokens": 354453623.0, + "step": 9290 + }, + { + "epoch": 1.1819106983844294, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.9056777954101562, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8615385293960571, + "num_tokens": 354491854.0, + "step": 9291 + }, + { + "epoch": 1.18203790866302, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.7809269428253174, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.849215030670166, + "num_tokens": 354530788.0, + "step": 9292 + }, + { + "epoch": 1.1821651189416105, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8110450506210327, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.882933497428894, + "num_tokens": 354568576.0, + "step": 9293 + }, + { + "epoch": 1.182292329220201, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9153152704238892, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8641285300254822, + "num_tokens": 354611307.0, + "step": 9294 + }, + { + "epoch": 1.1824195394987915, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8772149085998535, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8706413507461548, + "num_tokens": 354648810.0, + "step": 9295 + }, + { + "epoch": 1.182546749777382, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.7936127185821533, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8652918934822083, + "num_tokens": 354688411.0, + "step": 9296 + }, + { + "epoch": 1.1826739600559726, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.781511664390564, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8616908192634583, + "num_tokens": 354734700.0, + "step": 9297 + }, + { + "epoch": 1.1828011703345631, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.123800277709961, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8597264289855957, + "num_tokens": 354765517.0, + "step": 9298 + }, + { + "epoch": 1.1829283806131534, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.761171579360962, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8665850758552551, + "num_tokens": 354808563.0, + "step": 9299 + }, + { + "epoch": 1.183055590891744, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9727295637130737, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8451346158981323, + "num_tokens": 354846618.0, + "step": 9300 + }, + { + "epoch": 1.1831828011703345, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.966062307357788, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.864253044128418, + "num_tokens": 354881293.0, + "step": 9301 + }, + { + "epoch": 1.183310011448925, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.737142562866211, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8611773252487183, + "num_tokens": 354925239.0, + "step": 9302 + }, + { + "epoch": 1.1834372217275155, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9274979829788208, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8496639728546143, + "num_tokens": 354967836.0, + "step": 9303 + }, + { + "epoch": 1.183564432006106, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0903866291046143, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8469082713127136, + "num_tokens": 355003386.0, + "step": 9304 + }, + { + "epoch": 1.1836916422846966, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9452670812606812, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8413382172584534, + "num_tokens": 355043391.0, + "step": 9305 + }, + { + "epoch": 1.1838188525632871, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9374843835830688, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8644373416900635, + "num_tokens": 355082820.0, + "step": 9306 + }, + { + "epoch": 1.1839460628418776, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.7228541374206543, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8837532997131348, + "num_tokens": 355117531.0, + "step": 9307 + }, + { + "epoch": 1.1840732731204682, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9339922666549683, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8512943983078003, + "num_tokens": 355159485.0, + "step": 9308 + }, + { + "epoch": 1.1842004833990587, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 3.0232346057891846, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8585976362228394, + "num_tokens": 355206051.0, + "step": 9309 + }, + { + "epoch": 1.1843276936776492, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 16.612545013427734, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.85452800989151, + "num_tokens": 355242933.0, + "step": 9310 + }, + { + "epoch": 1.1844549039562398, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9411330223083496, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8748767971992493, + "num_tokens": 355286311.0, + "step": 9311 + }, + { + "epoch": 1.1845821142348303, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7802399396896362, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8623736500740051, + "num_tokens": 355327844.0, + "step": 9312 + }, + { + "epoch": 1.1847093245134206, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.6775788068771362, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.872067928314209, + "num_tokens": 355368977.0, + "step": 9313 + }, + { + "epoch": 1.1848365347920111, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.822614312171936, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8534225225448608, + "num_tokens": 355411479.0, + "step": 9314 + }, + { + "epoch": 1.1849637450706016, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8657150268554688, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.881561279296875, + "num_tokens": 355447370.0, + "step": 9315 + }, + { + "epoch": 1.1850909553491922, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.7420923709869385, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8596588373184204, + "num_tokens": 355490335.0, + "step": 9316 + }, + { + "epoch": 1.1852181656277827, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8927651643753052, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8630626201629639, + "num_tokens": 355527843.0, + "step": 9317 + }, + { + "epoch": 1.1853453759063732, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.8870402574539185, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.859123945236206, + "num_tokens": 355566094.0, + "step": 9318 + }, + { + "epoch": 1.1854725861849638, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 3.2816622257232666, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8704853057861328, + "num_tokens": 355609505.0, + "step": 9319 + }, + { + "epoch": 1.1855997964635543, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0297417640686035, + "learning_rate": 1e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8373653888702393, + "num_tokens": 355650930.0, + "step": 9320 + }, + { + "epoch": 1.1857270067421448, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0861480236053467, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8718331456184387, + "num_tokens": 355684075.0, + "step": 9321 + }, + { + "epoch": 1.1858542170207353, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8713533878326416, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8692511320114136, + "num_tokens": 355723567.0, + "step": 9322 + }, + { + "epoch": 1.1859814272993259, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.9010334014892578, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8480242490768433, + "num_tokens": 355764915.0, + "step": 9323 + }, + { + "epoch": 1.1861086375779162, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.9279212951660156, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8644655346870422, + "num_tokens": 355804950.0, + "step": 9324 + }, + { + "epoch": 1.1862358478565067, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.764940857887268, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8706824779510498, + "num_tokens": 355844155.0, + "step": 9325 + }, + { + "epoch": 1.1863630581350972, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.7152483463287354, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8790820837020874, + "num_tokens": 355883971.0, + "step": 9326 + }, + { + "epoch": 1.1864902684136878, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.9168062210083008, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8579806089401245, + "num_tokens": 355922271.0, + "step": 9327 + }, + { + "epoch": 1.1866174786922783, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.8830357789993286, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8613122701644897, + "num_tokens": 355961087.0, + "step": 9328 + }, + { + "epoch": 1.1867446889708688, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.7015868425369263, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.866776704788208, + "num_tokens": 355999553.0, + "step": 9329 + }, + { + "epoch": 1.1868718992494594, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.880350947380066, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8676884770393372, + "num_tokens": 356038503.0, + "step": 9330 + }, + { + "epoch": 1.1869991095280499, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.8278729915618896, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8743584752082825, + "num_tokens": 356078538.0, + "step": 9331 + }, + { + "epoch": 1.1871263198066404, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.5289783477783203, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.875453531742096, + "num_tokens": 356115031.0, + "step": 9332 + }, + { + "epoch": 1.187253530085231, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.8285328149795532, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8773515224456787, + "num_tokens": 356154751.0, + "step": 9333 + }, + { + "epoch": 1.1873807403638215, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.9780138731002808, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8478828072547913, + "num_tokens": 356192105.0, + "step": 9334 + }, + { + "epoch": 1.187507950642412, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 2.00346040725708, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8515673279762268, + "num_tokens": 356224964.0, + "step": 9335 + }, + { + "epoch": 1.1876351609210025, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 20.466449737548828, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8682724833488464, + "num_tokens": 356261565.0, + "step": 9336 + }, + { + "epoch": 1.187762371199593, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0399656295776367, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8659888505935669, + "num_tokens": 356301886.0, + "step": 9337 + }, + { + "epoch": 1.1878895814781834, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.923704743385315, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8713003396987915, + "num_tokens": 356339481.0, + "step": 9338 + }, + { + "epoch": 1.1880167917567739, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8298282623291016, + "learning_rate": 1e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8402093052864075, + "num_tokens": 356387038.0, + "step": 9339 + }, + { + "epoch": 1.1881440020353644, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.901776909828186, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8587968349456787, + "num_tokens": 356433256.0, + "step": 9340 + }, + { + "epoch": 1.188271212313955, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.7541964054107666, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8526662588119507, + "num_tokens": 356471282.0, + "step": 9341 + }, + { + "epoch": 1.1883984225925455, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.811577320098877, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8685004711151123, + "num_tokens": 356508951.0, + "step": 9342 + }, + { + "epoch": 1.188525632871136, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0051028728485107, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8658608794212341, + "num_tokens": 356544487.0, + "step": 9343 + }, + { + "epoch": 1.1886528431497265, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9140775203704834, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8696335554122925, + "num_tokens": 356580873.0, + "step": 9344 + }, + { + "epoch": 1.188780053428317, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9892467260360718, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8600075840950012, + "num_tokens": 356613622.0, + "step": 9345 + }, + { + "epoch": 1.1889072637069076, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9666236639022827, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8405075073242188, + "num_tokens": 356660058.0, + "step": 9346 + }, + { + "epoch": 1.189034473985498, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8273985385894775, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8611044883728027, + "num_tokens": 356701614.0, + "step": 9347 + }, + { + "epoch": 1.1891616842640884, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8381108045578003, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8770508766174316, + "num_tokens": 356737105.0, + "step": 9348 + }, + { + "epoch": 1.189288894542679, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.88681960105896, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8536542654037476, + "num_tokens": 356779382.0, + "step": 9349 + }, + { + "epoch": 1.1894161048212695, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.034379482269287, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8580002188682556, + "num_tokens": 356813836.0, + "step": 9350 + }, + { + "epoch": 1.18954331509986, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9146636724472046, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8538703918457031, + "num_tokens": 356852591.0, + "step": 9351 + }, + { + "epoch": 1.1896705253784505, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.833719253540039, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8621325492858887, + "num_tokens": 356889940.0, + "step": 9352 + }, + { + "epoch": 1.189797735657041, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8775120973587036, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8647588491439819, + "num_tokens": 356927878.0, + "step": 9353 + }, + { + "epoch": 1.1899249459356316, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9708976745605469, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8715193867683411, + "num_tokens": 356963339.0, + "step": 9354 + }, + { + "epoch": 1.1900521562142221, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8109747171401978, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8676049709320068, + "num_tokens": 357002999.0, + "step": 9355 + }, + { + "epoch": 1.1901793664928126, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8658334016799927, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.865868866443634, + "num_tokens": 357044305.0, + "step": 9356 + }, + { + "epoch": 1.1903065767714032, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8510034084320068, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8644059300422668, + "num_tokens": 357082197.0, + "step": 9357 + }, + { + "epoch": 1.1904337870499937, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.053149700164795, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8452814221382141, + "num_tokens": 357119541.0, + "step": 9358 + }, + { + "epoch": 1.1905609973285842, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9127516746520996, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8841758966445923, + "num_tokens": 357159176.0, + "step": 9359 + }, + { + "epoch": 1.1906882076071748, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.931021809577942, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8625293970108032, + "num_tokens": 357199204.0, + "step": 9360 + }, + { + "epoch": 1.1908154178857653, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8516160249710083, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.877932071685791, + "num_tokens": 357236900.0, + "step": 9361 + }, + { + "epoch": 1.1909426281643556, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8948529958724976, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8639551401138306, + "num_tokens": 357271622.0, + "step": 9362 + }, + { + "epoch": 1.1910698384429461, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.760512113571167, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8742921352386475, + "num_tokens": 357311145.0, + "step": 9363 + }, + { + "epoch": 1.1911970487215366, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9082374572753906, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.879584789276123, + "num_tokens": 357344609.0, + "step": 9364 + }, + { + "epoch": 1.1913242590001272, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.933028221130371, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8604459166526794, + "num_tokens": 357378896.0, + "step": 9365 + }, + { + "epoch": 1.1914514692787177, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.7857210636138916, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8669965863227844, + "num_tokens": 357424730.0, + "step": 9366 + }, + { + "epoch": 1.1915786795573082, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9872453212738037, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8447310924530029, + "num_tokens": 357461152.0, + "step": 9367 + }, + { + "epoch": 1.1917058898358988, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8021987676620483, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8562705516815186, + "num_tokens": 357502305.0, + "step": 9368 + }, + { + "epoch": 1.1918331001144893, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0285608768463135, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8567432165145874, + "num_tokens": 357539187.0, + "step": 9369 + }, + { + "epoch": 1.1919603103930798, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0539629459381104, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.851928174495697, + "num_tokens": 357569520.0, + "step": 9370 + }, + { + "epoch": 1.1920875206716703, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8685258626937866, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8621780276298523, + "num_tokens": 357605253.0, + "step": 9371 + }, + { + "epoch": 1.1922147309502609, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8645195960998535, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.873235821723938, + "num_tokens": 357639701.0, + "step": 9372 + }, + { + "epoch": 1.1923419412288512, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9526091814041138, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8713784217834473, + "num_tokens": 357675611.0, + "step": 9373 + }, + { + "epoch": 1.1924691515074417, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.065425157546997, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.874251127243042, + "num_tokens": 357708521.0, + "step": 9374 + }, + { + "epoch": 1.1925963617860322, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0034377574920654, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8528069853782654, + "num_tokens": 357749970.0, + "step": 9375 + }, + { + "epoch": 1.1927235720646228, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1542246341705322, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8728023767471313, + "num_tokens": 357783287.0, + "step": 9376 + }, + { + "epoch": 1.1928507823432133, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0006537437438965, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8634777069091797, + "num_tokens": 357818883.0, + "step": 9377 + }, + { + "epoch": 1.1929779926218038, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7168245315551758, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8761634826660156, + "num_tokens": 357862862.0, + "step": 9378 + }, + { + "epoch": 1.1931052029003943, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0557637214660645, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8526332974433899, + "num_tokens": 357901574.0, + "step": 9379 + }, + { + "epoch": 1.1932324131789849, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8868803977966309, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8632046580314636, + "num_tokens": 357941822.0, + "step": 9380 + }, + { + "epoch": 1.1933596234575754, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9248028993606567, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8555818796157837, + "num_tokens": 357982820.0, + "step": 9381 + }, + { + "epoch": 1.193486833736166, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9576410055160522, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8598412871360779, + "num_tokens": 358020491.0, + "step": 9382 + }, + { + "epoch": 1.1936140440147565, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7607622146606445, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8653148412704468, + "num_tokens": 358059775.0, + "step": 9383 + }, + { + "epoch": 1.193741254293347, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9180834293365479, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8707976341247559, + "num_tokens": 358098672.0, + "step": 9384 + }, + { + "epoch": 1.1938684645719375, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.941068172454834, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8564984798431396, + "num_tokens": 358140850.0, + "step": 9385 + }, + { + "epoch": 1.193995674850528, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.012876272201538, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8695043325424194, + "num_tokens": 358175503.0, + "step": 9386 + }, + { + "epoch": 1.1941228851291183, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.933467984199524, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8651735186576843, + "num_tokens": 358210615.0, + "step": 9387 + }, + { + "epoch": 1.1942500954077089, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9226458072662354, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8646466135978699, + "num_tokens": 358246605.0, + "step": 9388 + }, + { + "epoch": 1.1943773056862994, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.846767783164978, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8599469661712646, + "num_tokens": 358283136.0, + "step": 9389 + }, + { + "epoch": 1.19450451596489, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7578864097595215, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8722891807556152, + "num_tokens": 358325437.0, + "step": 9390 + }, + { + "epoch": 1.1946317262434805, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.77439284324646, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8628073930740356, + "num_tokens": 358365360.0, + "step": 9391 + }, + { + "epoch": 1.194758936522071, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7815619707107544, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8791771531105042, + "num_tokens": 358406502.0, + "step": 9392 + }, + { + "epoch": 1.1948861468006615, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.016669273376465, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8598670363426208, + "num_tokens": 358437220.0, + "step": 9393 + }, + { + "epoch": 1.195013357079252, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9888219833374023, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8506993055343628, + "num_tokens": 358475314.0, + "step": 9394 + }, + { + "epoch": 1.1951405673578426, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.748219609260559, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8729809522628784, + "num_tokens": 358520777.0, + "step": 9395 + }, + { + "epoch": 1.195267777636433, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9058252573013306, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8824982643127441, + "num_tokens": 358557973.0, + "step": 9396 + }, + { + "epoch": 1.1953949879150234, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9646339416503906, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8716622591018677, + "num_tokens": 358596795.0, + "step": 9397 + }, + { + "epoch": 1.195522198193614, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.982230305671692, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8632087707519531, + "num_tokens": 358632146.0, + "step": 9398 + }, + { + "epoch": 1.1956494084722045, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8187670707702637, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8717868328094482, + "num_tokens": 358672194.0, + "step": 9399 + }, + { + "epoch": 1.195776618750795, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9503594636917114, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8713518381118774, + "num_tokens": 358710192.0, + "step": 9400 + }, + { + "epoch": 1.1959038290293855, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.91386878490448, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8499188423156738, + "num_tokens": 358749284.0, + "step": 9401 + }, + { + "epoch": 1.196031039307976, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0751285552978516, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8585454225540161, + "num_tokens": 358782830.0, + "step": 9402 + }, + { + "epoch": 1.1961582495865666, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0407662391662598, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8615230321884155, + "num_tokens": 358820696.0, + "step": 9403 + }, + { + "epoch": 1.196285459865157, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.301929235458374, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8700993657112122, + "num_tokens": 358853616.0, + "step": 9404 + }, + { + "epoch": 1.1964126701437476, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.772789478302002, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8732744455337524, + "num_tokens": 358893420.0, + "step": 9405 + }, + { + "epoch": 1.1965398804223382, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1208410263061523, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8652507066726685, + "num_tokens": 358930704.0, + "step": 9406 + }, + { + "epoch": 1.1966670907009287, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9396380186080933, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8769108653068542, + "num_tokens": 358967730.0, + "step": 9407 + }, + { + "epoch": 1.1967943009795192, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8202382326126099, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8764317631721497, + "num_tokens": 359007241.0, + "step": 9408 + }, + { + "epoch": 1.1969215112581097, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0126802921295166, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8505305647850037, + "num_tokens": 359045573.0, + "step": 9409 + }, + { + "epoch": 1.1970487215367003, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.973681926727295, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8672190308570862, + "num_tokens": 359083420.0, + "step": 9410 + }, + { + "epoch": 1.1971759318152906, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.010735034942627, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8765251636505127, + "num_tokens": 359117555.0, + "step": 9411 + }, + { + "epoch": 1.1973031420938811, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.3041040897369385, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8609569072723389, + "num_tokens": 359154597.0, + "step": 9412 + }, + { + "epoch": 1.1974303523724716, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9497296810150146, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8575891256332397, + "num_tokens": 359190074.0, + "step": 9413 + }, + { + "epoch": 1.1975575626510622, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9430487155914307, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8507977724075317, + "num_tokens": 359232711.0, + "step": 9414 + }, + { + "epoch": 1.1976847729296527, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.364823818206787, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8620299100875854, + "num_tokens": 359265582.0, + "step": 9415 + }, + { + "epoch": 1.1978119832082432, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8946079015731812, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8721669912338257, + "num_tokens": 359303177.0, + "step": 9416 + }, + { + "epoch": 1.1979391934868338, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8214397430419922, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8727524280548096, + "num_tokens": 359338712.0, + "step": 9417 + }, + { + "epoch": 1.1980664037654243, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9273171424865723, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8712042570114136, + "num_tokens": 359372849.0, + "step": 9418 + }, + { + "epoch": 1.1981936140440148, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0546250343322754, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.868385910987854, + "num_tokens": 359407981.0, + "step": 9419 + }, + { + "epoch": 1.1983208243226053, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8699032068252563, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8449877500534058, + "num_tokens": 359450180.0, + "step": 9420 + }, + { + "epoch": 1.1984480346011959, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9579545259475708, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8637466430664062, + "num_tokens": 359487909.0, + "step": 9421 + }, + { + "epoch": 1.1985752448797862, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.863950490951538, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8746607899665833, + "num_tokens": 359522609.0, + "step": 9422 + }, + { + "epoch": 1.1987024551583767, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.0604443550109863, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8611929416656494, + "num_tokens": 359560735.0, + "step": 9423 + }, + { + "epoch": 1.1988296654369672, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9267553091049194, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8576358556747437, + "num_tokens": 359602994.0, + "step": 9424 + }, + { + "epoch": 1.1989568757155578, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8650935888290405, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8639047145843506, + "num_tokens": 359638864.0, + "step": 9425 + }, + { + "epoch": 1.1990840859941483, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1175825595855713, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8713189959526062, + "num_tokens": 359674202.0, + "step": 9426 + }, + { + "epoch": 1.1992112962727388, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0301594734191895, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8693957924842834, + "num_tokens": 359710408.0, + "step": 9427 + }, + { + "epoch": 1.1993385065513293, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8696359395980835, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8715529441833496, + "num_tokens": 359749068.0, + "step": 9428 + }, + { + "epoch": 1.1994657168299199, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9632694721221924, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8644778728485107, + "num_tokens": 359788449.0, + "step": 9429 + }, + { + "epoch": 1.1995929271085104, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0186877250671387, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8588442206382751, + "num_tokens": 359824167.0, + "step": 9430 + }, + { + "epoch": 1.199720137387101, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9929940700531006, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8587837219238281, + "num_tokens": 359861476.0, + "step": 9431 + }, + { + "epoch": 1.1998473476656915, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8833842277526855, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8627306818962097, + "num_tokens": 359906463.0, + "step": 9432 + }, + { + "epoch": 1.199974557944282, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8748345375061035, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8659586906433105, + "num_tokens": 359940576.0, + "step": 9433 + }, + { + "epoch": 1.2001017682228725, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9423041343688965, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8745975494384766, + "num_tokens": 359973616.0, + "step": 9434 + }, + { + "epoch": 1.200228978501463, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.932741641998291, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8758562207221985, + "num_tokens": 360011059.0, + "step": 9435 + }, + { + "epoch": 1.2003561887800533, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8885952234268188, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8735793828964233, + "num_tokens": 360048307.0, + "step": 9436 + }, + { + "epoch": 1.2004833990586439, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9776732921600342, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.86484694480896, + "num_tokens": 360086946.0, + "step": 9437 + }, + { + "epoch": 1.2006106093372344, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9388080835342407, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.868004560470581, + "num_tokens": 360121908.0, + "step": 9438 + }, + { + "epoch": 1.200737819615825, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.034066915512085, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8562441468238831, + "num_tokens": 360164349.0, + "step": 9439 + }, + { + "epoch": 1.2008650298944155, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8531358242034912, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.879315972328186, + "num_tokens": 360197660.0, + "step": 9440 + }, + { + "epoch": 1.200992240173006, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9447213411331177, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8431321382522583, + "num_tokens": 360240297.0, + "step": 9441 + }, + { + "epoch": 1.2011194504515965, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.679689407348633, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8534604907035828, + "num_tokens": 360281566.0, + "step": 9442 + }, + { + "epoch": 1.201246660730187, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1212596893310547, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.869304895401001, + "num_tokens": 360313927.0, + "step": 9443 + }, + { + "epoch": 1.2013738710087776, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9785281419754028, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8673166036605835, + "num_tokens": 360349369.0, + "step": 9444 + }, + { + "epoch": 1.201501081287368, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9643861055374146, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.864395260810852, + "num_tokens": 360386996.0, + "step": 9445 + }, + { + "epoch": 1.2016282915659584, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.7555173635482788, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.868998646736145, + "num_tokens": 360424883.0, + "step": 9446 + }, + { + "epoch": 1.201755501844549, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.011378526687622, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8682513236999512, + "num_tokens": 360457992.0, + "step": 9447 + }, + { + "epoch": 1.2018827121231395, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.1020007133483887, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8495073318481445, + "num_tokens": 360497934.0, + "step": 9448 + }, + { + "epoch": 1.20200992240173, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9711003303527832, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8604855537414551, + "num_tokens": 360533928.0, + "step": 9449 + }, + { + "epoch": 1.2021371326803205, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9962539672851562, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8667848110198975, + "num_tokens": 360571076.0, + "step": 9450 + }, + { + "epoch": 1.202264342958911, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.830252766609192, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8566440343856812, + "num_tokens": 360611957.0, + "step": 9451 + }, + { + "epoch": 1.2023915532375016, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.917508602142334, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8576327562332153, + "num_tokens": 360647823.0, + "step": 9452 + }, + { + "epoch": 1.202518763516092, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9757862091064453, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8559625148773193, + "num_tokens": 360688341.0, + "step": 9453 + }, + { + "epoch": 1.2026459737946826, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.917945384979248, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8560726642608643, + "num_tokens": 360730417.0, + "step": 9454 + }, + { + "epoch": 1.2027731840732732, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.039119005203247, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8438645601272583, + "num_tokens": 360771931.0, + "step": 9455 + }, + { + "epoch": 1.2029003943518637, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8371320962905884, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8623322248458862, + "num_tokens": 360811460.0, + "step": 9456 + }, + { + "epoch": 1.2030276046304542, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9405739307403564, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8767861127853394, + "num_tokens": 360849476.0, + "step": 9457 + }, + { + "epoch": 1.2031548149090447, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.923592448234558, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8538910150527954, + "num_tokens": 360891178.0, + "step": 9458 + }, + { + "epoch": 1.2032820251876353, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8493316173553467, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.860389232635498, + "num_tokens": 360930503.0, + "step": 9459 + }, + { + "epoch": 1.2034092354662256, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0485873222351074, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8605910539627075, + "num_tokens": 360963730.0, + "step": 9460 + }, + { + "epoch": 1.203536445744816, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8689684867858887, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8505758047103882, + "num_tokens": 361003796.0, + "step": 9461 + }, + { + "epoch": 1.2036636560234066, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.7449595928192139, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8640763759613037, + "num_tokens": 361044818.0, + "step": 9462 + }, + { + "epoch": 1.2037908663019972, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9490966796875, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8749963045120239, + "num_tokens": 361080692.0, + "step": 9463 + }, + { + "epoch": 1.2039180765805877, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9837775230407715, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8551812171936035, + "num_tokens": 361116609.0, + "step": 9464 + }, + { + "epoch": 1.2040452868591782, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0665764808654785, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8573403358459473, + "num_tokens": 361154559.0, + "step": 9465 + }, + { + "epoch": 1.2041724971377687, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8844221830368042, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8727788925170898, + "num_tokens": 361191005.0, + "step": 9466 + }, + { + "epoch": 1.2042997074163593, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8461560010910034, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8620461225509644, + "num_tokens": 361230191.0, + "step": 9467 + }, + { + "epoch": 1.2044269176949498, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.890065312385559, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8577724695205688, + "num_tokens": 361269132.0, + "step": 9468 + }, + { + "epoch": 1.2045541279735403, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8709056377410889, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8564380407333374, + "num_tokens": 361306674.0, + "step": 9469 + }, + { + "epoch": 1.2046813382521309, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.845381259918213, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8649640083312988, + "num_tokens": 361348309.0, + "step": 9470 + }, + { + "epoch": 1.2048085485307212, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8356542587280273, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8589000701904297, + "num_tokens": 361386824.0, + "step": 9471 + }, + { + "epoch": 1.2049357588093117, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9008313417434692, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8449865579605103, + "num_tokens": 361426590.0, + "step": 9472 + }, + { + "epoch": 1.2050629690879022, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9818241596221924, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8498885035514832, + "num_tokens": 361463259.0, + "step": 9473 + }, + { + "epoch": 1.2051901793664928, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9817273616790771, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8720324039459229, + "num_tokens": 361498750.0, + "step": 9474 + }, + { + "epoch": 1.2053173896450833, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9958378076553345, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8530082702636719, + "num_tokens": 361537101.0, + "step": 9475 + }, + { + "epoch": 1.2054445999236738, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.013518810272217, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8658242225646973, + "num_tokens": 361572130.0, + "step": 9476 + }, + { + "epoch": 1.2055718102022643, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8952609300613403, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.879397451877594, + "num_tokens": 361608484.0, + "step": 9477 + }, + { + "epoch": 1.2056990204808549, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8372366428375244, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8808627724647522, + "num_tokens": 361650361.0, + "step": 9478 + }, + { + "epoch": 1.2058262307594454, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1943414211273193, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8724238276481628, + "num_tokens": 361680670.0, + "step": 9479 + }, + { + "epoch": 1.205953441038036, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8409568071365356, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8694976568222046, + "num_tokens": 361723504.0, + "step": 9480 + }, + { + "epoch": 1.2060806513166265, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8360735177993774, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8695875406265259, + "num_tokens": 361765850.0, + "step": 9481 + }, + { + "epoch": 1.206207861595217, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8188711404800415, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8556908369064331, + "num_tokens": 361804375.0, + "step": 9482 + }, + { + "epoch": 1.2063350718738075, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9962384700775146, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8609269857406616, + "num_tokens": 361834714.0, + "step": 9483 + }, + { + "epoch": 1.206462282152398, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9367725849151611, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8638732433319092, + "num_tokens": 361877568.0, + "step": 9484 + }, + { + "epoch": 1.2065894924309883, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.964030146598816, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8596197962760925, + "num_tokens": 361914785.0, + "step": 9485 + }, + { + "epoch": 1.2067167027095789, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7247893810272217, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8707501292228699, + "num_tokens": 361956781.0, + "step": 9486 + }, + { + "epoch": 1.2068439129881694, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7637033462524414, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8735631704330444, + "num_tokens": 361997408.0, + "step": 9487 + }, + { + "epoch": 1.20697112326676, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9932581186294556, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.864533543586731, + "num_tokens": 362030226.0, + "step": 9488 + }, + { + "epoch": 1.2070983335453505, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8227144479751587, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8746675848960876, + "num_tokens": 362068370.0, + "step": 9489 + }, + { + "epoch": 1.207225543823941, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9882315397262573, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8713071942329407, + "num_tokens": 362108569.0, + "step": 9490 + }, + { + "epoch": 1.2073527541025315, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.246232271194458, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.855013370513916, + "num_tokens": 362140476.0, + "step": 9491 + }, + { + "epoch": 1.207479964381122, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1830062866210938, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8704641461372375, + "num_tokens": 362171132.0, + "step": 9492 + }, + { + "epoch": 1.2076071746597126, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1001713275909424, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8629874587059021, + "num_tokens": 362204791.0, + "step": 9493 + }, + { + "epoch": 1.207734384938303, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9587985277175903, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8672860264778137, + "num_tokens": 362242360.0, + "step": 9494 + }, + { + "epoch": 1.2078615952168934, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0975148677825928, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8671508431434631, + "num_tokens": 362276558.0, + "step": 9495 + }, + { + "epoch": 1.207988805495484, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.4480392932891846, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8493508100509644, + "num_tokens": 362307055.0, + "step": 9496 + }, + { + "epoch": 1.2081160157740745, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1066691875457764, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8746724128723145, + "num_tokens": 362342433.0, + "step": 9497 + }, + { + "epoch": 1.208243226052665, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1969943046569824, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8847991228103638, + "num_tokens": 362377008.0, + "step": 9498 + }, + { + "epoch": 1.2083704363312555, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2105391025543213, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8779664039611816, + "num_tokens": 362412089.0, + "step": 9499 + }, + { + "epoch": 1.208497646609846, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9127490520477295, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8549211621284485, + "num_tokens": 362455817.0, + "step": 9500 + }, + { + "epoch": 1.2086248568884366, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7989108562469482, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8650562763214111, + "num_tokens": 362498859.0, + "step": 9501 + }, + { + "epoch": 1.208752067167027, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.893571138381958, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.864764928817749, + "num_tokens": 362542346.0, + "step": 9502 + }, + { + "epoch": 1.2088792774456176, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1533350944519043, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8815656900405884, + "num_tokens": 362575360.0, + "step": 9503 + }, + { + "epoch": 1.2090064877242082, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.933220386505127, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.864623486995697, + "num_tokens": 362614737.0, + "step": 9504 + }, + { + "epoch": 1.2091336980027987, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0161361694335938, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8599041700363159, + "num_tokens": 362653756.0, + "step": 9505 + }, + { + "epoch": 1.2092609082813892, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.867244839668274, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8654274940490723, + "num_tokens": 362690745.0, + "step": 9506 + }, + { + "epoch": 1.2093881185599797, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9402964115142822, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8728641867637634, + "num_tokens": 362726328.0, + "step": 9507 + }, + { + "epoch": 1.2095153288385703, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8418984413146973, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8669032454490662, + "num_tokens": 362764285.0, + "step": 9508 + }, + { + "epoch": 1.2096425391171606, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.195572853088379, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8461942672729492, + "num_tokens": 362794697.0, + "step": 9509 + }, + { + "epoch": 1.209769749395751, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.068049669265747, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8620607256889343, + "num_tokens": 362831618.0, + "step": 9510 + }, + { + "epoch": 1.2098969596743416, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.190908670425415, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8727594614028931, + "num_tokens": 362866404.0, + "step": 9511 + }, + { + "epoch": 1.2100241699529322, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.006747007369995, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8808228969573975, + "num_tokens": 362902554.0, + "step": 9512 + }, + { + "epoch": 1.2101513802315227, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8475755453109741, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8586229085922241, + "num_tokens": 362944169.0, + "step": 9513 + }, + { + "epoch": 1.2102785905101132, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9050475358963013, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8429208397865295, + "num_tokens": 362983789.0, + "step": 9514 + }, + { + "epoch": 1.2104058007887037, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.00704026222229, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8703032732009888, + "num_tokens": 363021551.0, + "step": 9515 + }, + { + "epoch": 1.2105330110672943, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8916698694229126, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8468948602676392, + "num_tokens": 363064951.0, + "step": 9516 + }, + { + "epoch": 1.2106602213458848, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8331156969070435, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8632293343544006, + "num_tokens": 363110876.0, + "step": 9517 + }, + { + "epoch": 1.2107874316244753, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.790645956993103, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8621432781219482, + "num_tokens": 363155310.0, + "step": 9518 + }, + { + "epoch": 1.2109146419030659, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.801650047302246, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8755612969398499, + "num_tokens": 363193166.0, + "step": 9519 + }, + { + "epoch": 1.2110418521816562, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.2583959102630615, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8450152277946472, + "num_tokens": 363225163.0, + "step": 9520 + }, + { + "epoch": 1.2111690624602467, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8801547288894653, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8627092242240906, + "num_tokens": 363264720.0, + "step": 9521 + }, + { + "epoch": 1.2112962727388372, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8827465772628784, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8482794761657715, + "num_tokens": 363305029.0, + "step": 9522 + }, + { + "epoch": 1.2114234830174277, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 20.454362869262695, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8713992238044739, + "num_tokens": 363344128.0, + "step": 9523 + }, + { + "epoch": 1.2115506932960183, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.060929536819458, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8633511066436768, + "num_tokens": 363385903.0, + "step": 9524 + }, + { + "epoch": 1.2116779035746088, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.922009825706482, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8651387095451355, + "num_tokens": 363425216.0, + "step": 9525 + }, + { + "epoch": 1.2118051138531993, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.844651222229004, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8736896514892578, + "num_tokens": 363462691.0, + "step": 9526 + }, + { + "epoch": 1.2119323241317899, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.034518241882324, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8493219017982483, + "num_tokens": 363495849.0, + "step": 9527 + }, + { + "epoch": 1.2120595344103804, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9090632200241089, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8627619743347168, + "num_tokens": 363535268.0, + "step": 9528 + }, + { + "epoch": 1.212186744688971, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9720414876937866, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8470040559768677, + "num_tokens": 363573523.0, + "step": 9529 + }, + { + "epoch": 1.2123139549675614, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9621344804763794, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8633129000663757, + "num_tokens": 363607545.0, + "step": 9530 + }, + { + "epoch": 1.212441165246152, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8130879402160645, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8659780621528625, + "num_tokens": 363648456.0, + "step": 9531 + }, + { + "epoch": 1.2125683755247425, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.775225281715393, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8737009763717651, + "num_tokens": 363688726.0, + "step": 9532 + }, + { + "epoch": 1.212695585803333, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8627698421478271, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8567282557487488, + "num_tokens": 363728557.0, + "step": 9533 + }, + { + "epoch": 1.2128227960819233, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9295800924301147, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.864128828048706, + "num_tokens": 363764191.0, + "step": 9534 + }, + { + "epoch": 1.2129500063605139, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.308264970779419, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8515865802764893, + "num_tokens": 363798126.0, + "step": 9535 + }, + { + "epoch": 1.2130772166391044, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.073780059814453, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8474573493003845, + "num_tokens": 363836037.0, + "step": 9536 + }, + { + "epoch": 1.213204426917695, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8012568950653076, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8851287364959717, + "num_tokens": 363872359.0, + "step": 9537 + }, + { + "epoch": 1.2133316371962855, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8704646825790405, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.858069121837616, + "num_tokens": 363914206.0, + "step": 9538 + }, + { + "epoch": 1.213458847474876, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8153247833251953, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8650099635124207, + "num_tokens": 363956147.0, + "step": 9539 + }, + { + "epoch": 1.2135860577534665, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9808485507965088, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8806390762329102, + "num_tokens": 363986328.0, + "step": 9540 + }, + { + "epoch": 1.213713268032057, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9671155214309692, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8608299493789673, + "num_tokens": 364024905.0, + "step": 9541 + }, + { + "epoch": 1.2138404783106476, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.828092098236084, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8654593229293823, + "num_tokens": 364068628.0, + "step": 9542 + }, + { + "epoch": 1.213967688589238, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8821909427642822, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8604224920272827, + "num_tokens": 364104261.0, + "step": 9543 + }, + { + "epoch": 1.2140948988678284, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8347768783569336, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8607425689697266, + "num_tokens": 364147524.0, + "step": 9544 + }, + { + "epoch": 1.214222109146419, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.835558295249939, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8655490279197693, + "num_tokens": 364187715.0, + "step": 9545 + }, + { + "epoch": 1.2143493194250095, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.834362268447876, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8747327327728271, + "num_tokens": 364226081.0, + "step": 9546 + }, + { + "epoch": 1.2144765297036, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8375393152236938, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8640847206115723, + "num_tokens": 364268390.0, + "step": 9547 + }, + { + "epoch": 1.2146037399821905, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.739348292350769, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8668350577354431, + "num_tokens": 364310176.0, + "step": 9548 + }, + { + "epoch": 1.214730950260781, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8736780881881714, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8749536275863647, + "num_tokens": 364345652.0, + "step": 9549 + }, + { + "epoch": 1.2148581605393716, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9776238203048706, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8596689105033875, + "num_tokens": 364383810.0, + "step": 9550 + }, + { + "epoch": 1.214985370817962, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0428383350372314, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8678818941116333, + "num_tokens": 364421169.0, + "step": 9551 + }, + { + "epoch": 1.2151125810965526, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.023552894592285, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8578094244003296, + "num_tokens": 364452552.0, + "step": 9552 + }, + { + "epoch": 1.2152397913751432, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0908589363098145, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8631601333618164, + "num_tokens": 364486779.0, + "step": 9553 + }, + { + "epoch": 1.2153670016537337, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8372061252593994, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8736609816551208, + "num_tokens": 364526237.0, + "step": 9554 + }, + { + "epoch": 1.2154942119323242, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9342126846313477, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.870357871055603, + "num_tokens": 364568446.0, + "step": 9555 + }, + { + "epoch": 1.2156214222109147, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.866204857826233, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8709216117858887, + "num_tokens": 364604448.0, + "step": 9556 + }, + { + "epoch": 1.2157486324895053, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8377102613449097, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8619548678398132, + "num_tokens": 364642707.0, + "step": 9557 + }, + { + "epoch": 1.2158758427680956, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.09977126121521, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8595375418663025, + "num_tokens": 364678428.0, + "step": 9558 + }, + { + "epoch": 1.216003053046686, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7791208028793335, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8781821727752686, + "num_tokens": 364718915.0, + "step": 9559 + }, + { + "epoch": 1.2161302633252766, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.817548394203186, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8806685209274292, + "num_tokens": 364756030.0, + "step": 9560 + }, + { + "epoch": 1.2162574736038672, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9078487157821655, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8611209392547607, + "num_tokens": 364796158.0, + "step": 9561 + }, + { + "epoch": 1.2163846838824577, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8873955011367798, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8717796802520752, + "num_tokens": 364832722.0, + "step": 9562 + }, + { + "epoch": 1.2165118941610482, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3583078384399414, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8637847900390625, + "num_tokens": 364874120.0, + "step": 9563 + }, + { + "epoch": 1.2166391044396387, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.098215341567993, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.866612434387207, + "num_tokens": 364913166.0, + "step": 9564 + }, + { + "epoch": 1.2167663147182293, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.180213689804077, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8461039066314697, + "num_tokens": 364949586.0, + "step": 9565 + }, + { + "epoch": 1.2168935249968198, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0629546642303467, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8463043570518494, + "num_tokens": 364989070.0, + "step": 9566 + }, + { + "epoch": 1.2170207352754103, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9317824840545654, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8674888610839844, + "num_tokens": 365027050.0, + "step": 9567 + }, + { + "epoch": 1.2171479455540009, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.1579816341400146, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.863223671913147, + "num_tokens": 365060050.0, + "step": 9568 + }, + { + "epoch": 1.2172751558325912, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.049389123916626, + "learning_rate": 1e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.8333383798599243, + "num_tokens": 365098030.0, + "step": 9569 + }, + { + "epoch": 1.2174023661111817, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7190910577774048, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8663557767868042, + "num_tokens": 365141143.0, + "step": 9570 + }, + { + "epoch": 1.2175295763897722, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.035248279571533, + "learning_rate": 1e-06, + "loss": 0.525, + "mean_token_accuracy": 0.840749204158783, + "num_tokens": 365174204.0, + "step": 9571 + }, + { + "epoch": 1.2176567866683627, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.1585028171539307, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8642555475234985, + "num_tokens": 365207170.0, + "step": 9572 + }, + { + "epoch": 1.2177839969469533, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.985182762145996, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8663376569747925, + "num_tokens": 365248926.0, + "step": 9573 + }, + { + "epoch": 1.2179112072255438, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.946815013885498, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8670763373374939, + "num_tokens": 365283588.0, + "step": 9574 + }, + { + "epoch": 1.2180384175041343, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8874799013137817, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.861247181892395, + "num_tokens": 365324869.0, + "step": 9575 + }, + { + "epoch": 1.2181656277827249, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.071948766708374, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8611695170402527, + "num_tokens": 365366832.0, + "step": 9576 + }, + { + "epoch": 1.2182928380613154, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0104243755340576, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8610249757766724, + "num_tokens": 365401148.0, + "step": 9577 + }, + { + "epoch": 1.218420048339906, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.888326644897461, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8656201362609863, + "num_tokens": 365442303.0, + "step": 9578 + }, + { + "epoch": 1.2185472586184964, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8478164672851562, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8758010268211365, + "num_tokens": 365477877.0, + "step": 9579 + }, + { + "epoch": 1.218674468897087, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.700625419616699, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8516874313354492, + "num_tokens": 365520648.0, + "step": 9580 + }, + { + "epoch": 1.2188016791756775, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0729622840881348, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8457566499710083, + "num_tokens": 365555129.0, + "step": 9581 + }, + { + "epoch": 1.218928889454268, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9337975978851318, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8693017959594727, + "num_tokens": 365594368.0, + "step": 9582 + }, + { + "epoch": 1.2190560997328583, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8832204341888428, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8629303574562073, + "num_tokens": 365634763.0, + "step": 9583 + }, + { + "epoch": 1.2191833100114489, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8305037021636963, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.879213273525238, + "num_tokens": 365675951.0, + "step": 9584 + }, + { + "epoch": 1.2193105202900394, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9074475765228271, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8710160255432129, + "num_tokens": 365710348.0, + "step": 9585 + }, + { + "epoch": 1.21943773056863, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8612661361694336, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8667214512825012, + "num_tokens": 365745166.0, + "step": 9586 + }, + { + "epoch": 1.2195649408472204, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8098011016845703, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8770965337753296, + "num_tokens": 365780122.0, + "step": 9587 + }, + { + "epoch": 1.219692151125811, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.04494309425354, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8622469305992126, + "num_tokens": 365816326.0, + "step": 9588 + }, + { + "epoch": 1.2198193614044015, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9372190237045288, + "learning_rate": 1e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8376812934875488, + "num_tokens": 365858938.0, + "step": 9589 + }, + { + "epoch": 1.219946571682992, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9103586673736572, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8527594804763794, + "num_tokens": 365893892.0, + "step": 9590 + }, + { + "epoch": 1.2200737819615826, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8385285139083862, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8533576130867004, + "num_tokens": 365938787.0, + "step": 9591 + }, + { + "epoch": 1.220200992240173, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.079580307006836, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8631908893585205, + "num_tokens": 365972664.0, + "step": 9592 + }, + { + "epoch": 1.2203282025187634, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9761853218078613, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8696173429489136, + "num_tokens": 366013367.0, + "step": 9593 + }, + { + "epoch": 1.220455412797354, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.7722349166870117, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8697882890701294, + "num_tokens": 366051637.0, + "step": 9594 + }, + { + "epoch": 1.2205826230759445, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0848639011383057, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8605837821960449, + "num_tokens": 366092713.0, + "step": 9595 + }, + { + "epoch": 1.220709833354535, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.910988688468933, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.864619791507721, + "num_tokens": 366131594.0, + "step": 9596 + }, + { + "epoch": 1.2208370436331255, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8654038906097412, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8583469390869141, + "num_tokens": 366170871.0, + "step": 9597 + }, + { + "epoch": 1.220964253911716, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8859193325042725, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8521018624305725, + "num_tokens": 366212974.0, + "step": 9598 + }, + { + "epoch": 1.2210914641903066, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9709722995758057, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8552727699279785, + "num_tokens": 366255904.0, + "step": 9599 + }, + { + "epoch": 1.221218674468897, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9501339197158813, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8685446977615356, + "num_tokens": 366288887.0, + "step": 9600 + }, + { + "epoch": 1.2213458847474876, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8244409561157227, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8768018484115601, + "num_tokens": 366328796.0, + "step": 9601 + }, + { + "epoch": 1.2214730950260781, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8422954082489014, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8608773350715637, + "num_tokens": 366373093.0, + "step": 9602 + }, + { + "epoch": 1.2216003053046687, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8754817247390747, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8728423714637756, + "num_tokens": 366414353.0, + "step": 9603 + }, + { + "epoch": 1.2217275155832592, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.8383992910385132, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8767103552818298, + "num_tokens": 366450683.0, + "step": 9604 + }, + { + "epoch": 1.2218547258618497, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9339367151260376, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8682754635810852, + "num_tokens": 366487158.0, + "step": 9605 + }, + { + "epoch": 1.2219819361404403, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9985837936401367, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8655787706375122, + "num_tokens": 366529984.0, + "step": 9606 + }, + { + "epoch": 1.2221091464190306, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9599168300628662, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8667464256286621, + "num_tokens": 366564100.0, + "step": 9607 + }, + { + "epoch": 1.222236356697621, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8619617223739624, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8719249367713928, + "num_tokens": 366603752.0, + "step": 9608 + }, + { + "epoch": 1.2223635669762116, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.892132043838501, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8662904500961304, + "num_tokens": 366642170.0, + "step": 9609 + }, + { + "epoch": 1.2224907772548022, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.083527088165283, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8502844572067261, + "num_tokens": 366684038.0, + "step": 9610 + }, + { + "epoch": 1.2226179875333927, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8980637788772583, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8613865971565247, + "num_tokens": 366717832.0, + "step": 9611 + }, + { + "epoch": 1.2227451978119832, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.072957754135132, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8580450415611267, + "num_tokens": 366757999.0, + "step": 9612 + }, + { + "epoch": 1.2228724080905737, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8443472385406494, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8723320960998535, + "num_tokens": 366798533.0, + "step": 9613 + }, + { + "epoch": 1.2229996183691643, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8016815185546875, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8614265322685242, + "num_tokens": 366843066.0, + "step": 9614 + }, + { + "epoch": 1.2231268286477548, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8294721841812134, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8792837262153625, + "num_tokens": 366883190.0, + "step": 9615 + }, + { + "epoch": 1.2232540389263453, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8932796716690063, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8679044246673584, + "num_tokens": 366920642.0, + "step": 9616 + }, + { + "epoch": 1.2233812492049359, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9485574960708618, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8651424646377563, + "num_tokens": 366954468.0, + "step": 9617 + }, + { + "epoch": 1.2235084594835262, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8723158836364746, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8482184410095215, + "num_tokens": 366991170.0, + "step": 9618 + }, + { + "epoch": 1.2236356697621167, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8984200954437256, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8691701889038086, + "num_tokens": 367031833.0, + "step": 9619 + }, + { + "epoch": 1.2237628800407072, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8243645429611206, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.859012246131897, + "num_tokens": 367074709.0, + "step": 9620 + }, + { + "epoch": 1.2238900903192977, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.56056809425354, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8611034154891968, + "num_tokens": 367106820.0, + "step": 9621 + }, + { + "epoch": 1.2240173005978883, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0198819637298584, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.878108024597168, + "num_tokens": 367142298.0, + "step": 9622 + }, + { + "epoch": 1.2241445108764788, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9431755542755127, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8543446063995361, + "num_tokens": 367179957.0, + "step": 9623 + }, + { + "epoch": 1.2242717211550693, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7754756212234497, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.867080807685852, + "num_tokens": 367226800.0, + "step": 9624 + }, + { + "epoch": 1.2243989314336599, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8838714361190796, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8687235713005066, + "num_tokens": 367257168.0, + "step": 9625 + }, + { + "epoch": 1.2245261417122504, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.878408670425415, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8646855354309082, + "num_tokens": 367295254.0, + "step": 9626 + }, + { + "epoch": 1.224653351990841, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8599334955215454, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8516654968261719, + "num_tokens": 367335814.0, + "step": 9627 + }, + { + "epoch": 1.2247805622694314, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7491885423660278, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8608871698379517, + "num_tokens": 367377592.0, + "step": 9628 + }, + { + "epoch": 1.224907772548022, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.581158399581909, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8540264368057251, + "num_tokens": 367408069.0, + "step": 9629 + }, + { + "epoch": 1.2250349828266125, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.02881121635437, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8649535179138184, + "num_tokens": 367444787.0, + "step": 9630 + }, + { + "epoch": 1.225162193105203, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7405999898910522, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8492302894592285, + "num_tokens": 367486848.0, + "step": 9631 + }, + { + "epoch": 1.2252894033837933, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.801133632659912, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8732255697250366, + "num_tokens": 367530456.0, + "step": 9632 + }, + { + "epoch": 1.2254166136623839, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.921883225440979, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8675035834312439, + "num_tokens": 367566513.0, + "step": 9633 + }, + { + "epoch": 1.2255438239409744, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8857871294021606, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.842473030090332, + "num_tokens": 367607757.0, + "step": 9634 + }, + { + "epoch": 1.225671034219565, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.006016731262207, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8664931058883667, + "num_tokens": 367645525.0, + "step": 9635 + }, + { + "epoch": 1.2257982444981554, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8760167360305786, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8712525963783264, + "num_tokens": 367680840.0, + "step": 9636 + }, + { + "epoch": 1.225925454776746, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.876489520072937, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8727870583534241, + "num_tokens": 367717444.0, + "step": 9637 + }, + { + "epoch": 1.2260526650553365, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0176963806152344, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8443553447723389, + "num_tokens": 367756548.0, + "step": 9638 + }, + { + "epoch": 1.226179875333927, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8506989479064941, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8580434918403625, + "num_tokens": 367798341.0, + "step": 9639 + }, + { + "epoch": 1.2263070856125176, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8834890127182007, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8706306219100952, + "num_tokens": 367835087.0, + "step": 9640 + }, + { + "epoch": 1.226434295891108, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.837908148765564, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8651849627494812, + "num_tokens": 367873816.0, + "step": 9641 + }, + { + "epoch": 1.2265615061696984, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8528743982315063, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.85830157995224, + "num_tokens": 367913621.0, + "step": 9642 + }, + { + "epoch": 1.226688716448289, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9334219694137573, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.854840099811554, + "num_tokens": 367946881.0, + "step": 9643 + }, + { + "epoch": 1.2268159267268794, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8814513683319092, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8677114844322205, + "num_tokens": 367980806.0, + "step": 9644 + }, + { + "epoch": 1.22694313700547, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8430620431900024, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8714975714683533, + "num_tokens": 368016041.0, + "step": 9645 + }, + { + "epoch": 1.2270703472840605, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9274770021438599, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.862352192401886, + "num_tokens": 368052331.0, + "step": 9646 + }, + { + "epoch": 1.227197557562651, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8911908864974976, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8726856112480164, + "num_tokens": 368091458.0, + "step": 9647 + }, + { + "epoch": 1.2273247678412416, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7932957410812378, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8523613214492798, + "num_tokens": 368132433.0, + "step": 9648 + }, + { + "epoch": 1.227451978119832, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9112542867660522, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8639196157455444, + "num_tokens": 368171030.0, + "step": 9649 + }, + { + "epoch": 1.2275791883984226, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8557192087173462, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8523937463760376, + "num_tokens": 368211555.0, + "step": 9650 + }, + { + "epoch": 1.2277063986770131, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9609955549240112, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8705126047134399, + "num_tokens": 368252076.0, + "step": 9651 + }, + { + "epoch": 1.2278336089556037, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8868392705917358, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8559063076972961, + "num_tokens": 368291265.0, + "step": 9652 + }, + { + "epoch": 1.2279608192341942, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8035972118377686, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.856368899345398, + "num_tokens": 368332178.0, + "step": 9653 + }, + { + "epoch": 1.2280880295127847, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7399096488952637, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8612493276596069, + "num_tokens": 368374103.0, + "step": 9654 + }, + { + "epoch": 1.2282152397913753, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.07773494720459, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8667576313018799, + "num_tokens": 368411928.0, + "step": 9655 + }, + { + "epoch": 1.2283424500699656, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0715203285217285, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8391051292419434, + "num_tokens": 368449200.0, + "step": 9656 + }, + { + "epoch": 1.228469660348556, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0445821285247803, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8464534282684326, + "num_tokens": 368489935.0, + "step": 9657 + }, + { + "epoch": 1.2285968706271466, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9359616041183472, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8755565881729126, + "num_tokens": 368522227.0, + "step": 9658 + }, + { + "epoch": 1.2287240809057371, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9613417387008667, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8632582426071167, + "num_tokens": 368555084.0, + "step": 9659 + }, + { + "epoch": 1.2288512911843277, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9092949628829956, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8590595722198486, + "num_tokens": 368591845.0, + "step": 9660 + }, + { + "epoch": 1.2289785014629182, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8434842824935913, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.852509081363678, + "num_tokens": 368637841.0, + "step": 9661 + }, + { + "epoch": 1.2291057117415087, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8220206499099731, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8783631324768066, + "num_tokens": 368677470.0, + "step": 9662 + }, + { + "epoch": 1.2292329220200993, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7747788429260254, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8772615194320679, + "num_tokens": 368718412.0, + "step": 9663 + }, + { + "epoch": 1.2293601322986898, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.1042640209198, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8524875044822693, + "num_tokens": 368759267.0, + "step": 9664 + }, + { + "epoch": 1.2294873425772803, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.944728136062622, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8538877367973328, + "num_tokens": 368797586.0, + "step": 9665 + }, + { + "epoch": 1.2296145528558708, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.932681918144226, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8553129434585571, + "num_tokens": 368833801.0, + "step": 9666 + }, + { + "epoch": 1.2297417631344612, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8454195261001587, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8475083708763123, + "num_tokens": 368878513.0, + "step": 9667 + }, + { + "epoch": 1.2298689734130517, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8855267763137817, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8558319807052612, + "num_tokens": 368917815.0, + "step": 9668 + }, + { + "epoch": 1.2299961836916422, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.896796703338623, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8528987169265747, + "num_tokens": 368954971.0, + "step": 9669 + }, + { + "epoch": 1.2301233939702327, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.791344404220581, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8683567643165588, + "num_tokens": 368996212.0, + "step": 9670 + }, + { + "epoch": 1.2302506042488233, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.869678020477295, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8766582608222961, + "num_tokens": 369032186.0, + "step": 9671 + }, + { + "epoch": 1.2303778145274138, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.692116618156433, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8681151866912842, + "num_tokens": 369077565.0, + "step": 9672 + }, + { + "epoch": 1.2305050248060043, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.1362929344177246, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8693796396255493, + "num_tokens": 369111935.0, + "step": 9673 + }, + { + "epoch": 1.2306322350845948, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.859614372253418, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8605409860610962, + "num_tokens": 369154374.0, + "step": 9674 + }, + { + "epoch": 1.2307594453631854, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7274736166000366, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8546355962753296, + "num_tokens": 369198557.0, + "step": 9675 + }, + { + "epoch": 1.230886655641776, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9998509883880615, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8462100625038147, + "num_tokens": 369237756.0, + "step": 9676 + }, + { + "epoch": 1.2310138659203664, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.040388584136963, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8488779067993164, + "num_tokens": 369270155.0, + "step": 9677 + }, + { + "epoch": 1.231141076198957, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7924710512161255, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8563392758369446, + "num_tokens": 369310870.0, + "step": 9678 + }, + { + "epoch": 1.2312682864775475, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0188918113708496, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8459121584892273, + "num_tokens": 369344679.0, + "step": 9679 + }, + { + "epoch": 1.231395496756138, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9113034009933472, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8629295825958252, + "num_tokens": 369380388.0, + "step": 9680 + }, + { + "epoch": 1.2315227070347283, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.752184271812439, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8737236857414246, + "num_tokens": 369422327.0, + "step": 9681 + }, + { + "epoch": 1.2316499173133189, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9706240892410278, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.869734525680542, + "num_tokens": 369457160.0, + "step": 9682 + }, + { + "epoch": 1.2317771275919094, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8063491582870483, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8629367351531982, + "num_tokens": 369497883.0, + "step": 9683 + }, + { + "epoch": 1.2319043378705, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7351711988449097, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8926042318344116, + "num_tokens": 369540014.0, + "step": 9684 + }, + { + "epoch": 1.2320315481490904, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.797827124595642, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8625872135162354, + "num_tokens": 369580424.0, + "step": 9685 + }, + { + "epoch": 1.232158758427681, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8232303857803345, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.859307050704956, + "num_tokens": 369623615.0, + "step": 9686 + }, + { + "epoch": 1.2322859687062715, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9973807334899902, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8503513932228088, + "num_tokens": 369658408.0, + "step": 9687 + }, + { + "epoch": 1.232413178984862, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9187054634094238, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.858461856842041, + "num_tokens": 369696311.0, + "step": 9688 + }, + { + "epoch": 1.2325403892634526, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9196528196334839, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8499676585197449, + "num_tokens": 369737473.0, + "step": 9689 + }, + { + "epoch": 1.232667599542043, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.1088037490844727, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8604069352149963, + "num_tokens": 369770151.0, + "step": 9690 + }, + { + "epoch": 1.2327948098206334, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7870103120803833, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8812191486358643, + "num_tokens": 369813503.0, + "step": 9691 + }, + { + "epoch": 1.232922020099224, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9532283544540405, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8639023900032043, + "num_tokens": 369848768.0, + "step": 9692 + }, + { + "epoch": 1.2330492303778144, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.953183889389038, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8657849431037903, + "num_tokens": 369892510.0, + "step": 9693 + }, + { + "epoch": 1.233176440656405, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9099242687225342, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8711243271827698, + "num_tokens": 369929638.0, + "step": 9694 + }, + { + "epoch": 1.2333036509349955, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.6630805730819702, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8833001852035522, + "num_tokens": 369971201.0, + "step": 9695 + }, + { + "epoch": 1.233430861213586, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.813341498374939, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8588101267814636, + "num_tokens": 370011713.0, + "step": 9696 + }, + { + "epoch": 1.2335580714921766, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.077590227127075, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8421251773834229, + "num_tokens": 370042647.0, + "step": 9697 + }, + { + "epoch": 1.233685281770767, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0183379650115967, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8607702255249023, + "num_tokens": 370083976.0, + "step": 9698 + }, + { + "epoch": 1.2338124920493576, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9173606634140015, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8598987460136414, + "num_tokens": 370126435.0, + "step": 9699 + }, + { + "epoch": 1.2339397023279481, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9243180751800537, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8715389966964722, + "num_tokens": 370165038.0, + "step": 9700 + }, + { + "epoch": 1.2340669126065387, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8948824405670166, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8546727299690247, + "num_tokens": 370203438.0, + "step": 9701 + }, + { + "epoch": 1.2341941228851292, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0605037212371826, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8767719268798828, + "num_tokens": 370235144.0, + "step": 9702 + }, + { + "epoch": 1.2343213331637197, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.93320631980896, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8577291369438171, + "num_tokens": 370278936.0, + "step": 9703 + }, + { + "epoch": 1.2344485434423103, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.906959891319275, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8667435646057129, + "num_tokens": 370312595.0, + "step": 9704 + }, + { + "epoch": 1.2345757537209006, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9210340976715088, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8599146604537964, + "num_tokens": 370351697.0, + "step": 9705 + }, + { + "epoch": 1.234702963999491, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8025211095809937, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.877466082572937, + "num_tokens": 370391264.0, + "step": 9706 + }, + { + "epoch": 1.2348301742780816, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 4.104971408843994, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8705918192863464, + "num_tokens": 370425248.0, + "step": 9707 + }, + { + "epoch": 1.2349573845566721, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.183098554611206, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.841590166091919, + "num_tokens": 370462670.0, + "step": 9708 + }, + { + "epoch": 1.2350845948352627, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.942824363708496, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8430348634719849, + "num_tokens": 370506126.0, + "step": 9709 + }, + { + "epoch": 1.2352118051138532, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8326791524887085, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8725095987319946, + "num_tokens": 370546895.0, + "step": 9710 + }, + { + "epoch": 1.2353390153924437, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.882463812828064, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.867037832736969, + "num_tokens": 370590645.0, + "step": 9711 + }, + { + "epoch": 1.2354662256710343, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9255958795547485, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8660788536071777, + "num_tokens": 370628614.0, + "step": 9712 + }, + { + "epoch": 1.2355934359496248, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9329556226730347, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8650922775268555, + "num_tokens": 370666082.0, + "step": 9713 + }, + { + "epoch": 1.2357206462282153, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.868606448173523, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8475459218025208, + "num_tokens": 370705780.0, + "step": 9714 + }, + { + "epoch": 1.2358478565068058, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.051079750061035, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8676265478134155, + "num_tokens": 370742550.0, + "step": 9715 + }, + { + "epoch": 1.2359750667853961, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9095443487167358, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8675222396850586, + "num_tokens": 370784406.0, + "step": 9716 + }, + { + "epoch": 1.2361022770639867, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9099808931350708, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8588124513626099, + "num_tokens": 370820911.0, + "step": 9717 + }, + { + "epoch": 1.2362294873425772, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8481086492538452, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8739065527915955, + "num_tokens": 370857427.0, + "step": 9718 + }, + { + "epoch": 1.2363566976211677, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8426190614700317, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8531621098518372, + "num_tokens": 370897989.0, + "step": 9719 + }, + { + "epoch": 1.2364839078997583, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.206336498260498, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.850820779800415, + "num_tokens": 370932672.0, + "step": 9720 + }, + { + "epoch": 1.2366111181783488, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0291426181793213, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8534716963768005, + "num_tokens": 370971027.0, + "step": 9721 + }, + { + "epoch": 1.2367383284569393, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7216380834579468, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.873396098613739, + "num_tokens": 371014742.0, + "step": 9722 + }, + { + "epoch": 1.2368655387355298, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9653856754302979, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8466627597808838, + "num_tokens": 371052065.0, + "step": 9723 + }, + { + "epoch": 1.2369927490141204, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8917392492294312, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8404372930526733, + "num_tokens": 371093897.0, + "step": 9724 + }, + { + "epoch": 1.237119959292711, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1875267028808594, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8611892461776733, + "num_tokens": 371132390.0, + "step": 9725 + }, + { + "epoch": 1.2372471695713014, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.927887201309204, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8727359175682068, + "num_tokens": 371170468.0, + "step": 9726 + }, + { + "epoch": 1.237374379849892, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9491857290267944, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8601463437080383, + "num_tokens": 371209323.0, + "step": 9727 + }, + { + "epoch": 1.2375015901284825, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8650075197219849, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.862853467464447, + "num_tokens": 371246526.0, + "step": 9728 + }, + { + "epoch": 1.237628800407073, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9125957489013672, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8666463494300842, + "num_tokens": 371282850.0, + "step": 9729 + }, + { + "epoch": 1.2377560106856633, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.053241729736328, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8716086149215698, + "num_tokens": 371312614.0, + "step": 9730 + }, + { + "epoch": 1.2378832209642538, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8994603157043457, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8650381565093994, + "num_tokens": 371350170.0, + "step": 9731 + }, + { + "epoch": 1.2380104312428444, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8505593538284302, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8699605464935303, + "num_tokens": 371389295.0, + "step": 9732 + }, + { + "epoch": 1.238137641521435, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8858848810195923, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8640766739845276, + "num_tokens": 371430579.0, + "step": 9733 + }, + { + "epoch": 1.2382648518000254, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0225555896759033, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8536931276321411, + "num_tokens": 371465087.0, + "step": 9734 + }, + { + "epoch": 1.238392062078616, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.6503918170928955, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8700054883956909, + "num_tokens": 371503549.0, + "step": 9735 + }, + { + "epoch": 1.2385192723572065, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8798296451568604, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8833783864974976, + "num_tokens": 371542953.0, + "step": 9736 + }, + { + "epoch": 1.238646482635797, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9571683406829834, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8664377927780151, + "num_tokens": 371582541.0, + "step": 9737 + }, + { + "epoch": 1.2387736929143875, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9802782535552979, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8726034164428711, + "num_tokens": 371617221.0, + "step": 9738 + }, + { + "epoch": 1.238900903192978, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0200021266937256, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8658894300460815, + "num_tokens": 371660773.0, + "step": 9739 + }, + { + "epoch": 1.2390281134715684, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9505223035812378, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8578861951828003, + "num_tokens": 371694763.0, + "step": 9740 + }, + { + "epoch": 1.239155323750159, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9456877708435059, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.862123429775238, + "num_tokens": 371734793.0, + "step": 9741 + }, + { + "epoch": 1.2392825340287494, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9222792387008667, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8585808277130127, + "num_tokens": 371775691.0, + "step": 9742 + }, + { + "epoch": 1.23940974430734, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7361944913864136, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8546485900878906, + "num_tokens": 371817319.0, + "step": 9743 + }, + { + "epoch": 1.2395369545859305, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.068467378616333, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8622359037399292, + "num_tokens": 371847431.0, + "step": 9744 + }, + { + "epoch": 1.239664164864521, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0932559967041016, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8764369487762451, + "num_tokens": 371881788.0, + "step": 9745 + }, + { + "epoch": 1.2397913751431116, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0482301712036133, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8582122325897217, + "num_tokens": 371917388.0, + "step": 9746 + }, + { + "epoch": 1.239918585421702, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8670029640197754, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8724014163017273, + "num_tokens": 371957561.0, + "step": 9747 + }, + { + "epoch": 1.2400457957002926, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.9830188751220703, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8564833402633667, + "num_tokens": 371987322.0, + "step": 9748 + }, + { + "epoch": 1.2401730059788831, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9137756824493408, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.865247368812561, + "num_tokens": 372027263.0, + "step": 9749 + }, + { + "epoch": 1.2403002162574737, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9471334218978882, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8676179051399231, + "num_tokens": 372061441.0, + "step": 9750 + }, + { + "epoch": 1.2404274265360642, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8416954278945923, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8681501746177673, + "num_tokens": 372099085.0, + "step": 9751 + }, + { + "epoch": 1.2405546368146547, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9406681060791016, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8651491403579712, + "num_tokens": 372134106.0, + "step": 9752 + }, + { + "epoch": 1.2406818470932452, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.025099754333496, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8714339733123779, + "num_tokens": 372164635.0, + "step": 9753 + }, + { + "epoch": 1.2408090573718356, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8993107080459595, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8691087961196899, + "num_tokens": 372204924.0, + "step": 9754 + }, + { + "epoch": 1.240936267650426, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.352570056915283, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8642230033874512, + "num_tokens": 372240939.0, + "step": 9755 + }, + { + "epoch": 1.2410634779290166, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.1246590614318848, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8580076694488525, + "num_tokens": 372272355.0, + "step": 9756 + }, + { + "epoch": 1.2411906882076071, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.858575463294983, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8603047728538513, + "num_tokens": 372309566.0, + "step": 9757 + }, + { + "epoch": 1.2413178984861977, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8775031566619873, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8586980104446411, + "num_tokens": 372347768.0, + "step": 9758 + }, + { + "epoch": 1.2414451087647882, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9642572402954102, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8719640970230103, + "num_tokens": 372381020.0, + "step": 9759 + }, + { + "epoch": 1.2415723190433787, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9516241550445557, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.851398229598999, + "num_tokens": 372422434.0, + "step": 9760 + }, + { + "epoch": 1.2416995293219693, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8994592428207397, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8554753661155701, + "num_tokens": 372459342.0, + "step": 9761 + }, + { + "epoch": 1.2418267396005598, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0930004119873047, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8666843771934509, + "num_tokens": 372496637.0, + "step": 9762 + }, + { + "epoch": 1.2419539498791503, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0025265216827393, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8753687739372253, + "num_tokens": 372532068.0, + "step": 9763 + }, + { + "epoch": 1.2420811601577408, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8425586223602295, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8724422454833984, + "num_tokens": 372565460.0, + "step": 9764 + }, + { + "epoch": 1.2422083704363311, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7460014820098877, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.867287278175354, + "num_tokens": 372605998.0, + "step": 9765 + }, + { + "epoch": 1.2423355807149217, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9303876161575317, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8546208739280701, + "num_tokens": 372642038.0, + "step": 9766 + }, + { + "epoch": 1.2424627909935122, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8102269172668457, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8584434986114502, + "num_tokens": 372678192.0, + "step": 9767 + }, + { + "epoch": 1.2425900012721027, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.939558982849121, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8599483966827393, + "num_tokens": 372712867.0, + "step": 9768 + }, + { + "epoch": 1.2427172115506933, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.1311564445495605, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.845880925655365, + "num_tokens": 372745790.0, + "step": 9769 + }, + { + "epoch": 1.2428444218292838, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9029041528701782, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8728989958763123, + "num_tokens": 372777634.0, + "step": 9770 + }, + { + "epoch": 1.2429716321078743, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8272593021392822, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8478662967681885, + "num_tokens": 372820527.0, + "step": 9771 + }, + { + "epoch": 1.2430988423864648, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.750054955482483, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8736507296562195, + "num_tokens": 372858479.0, + "step": 9772 + }, + { + "epoch": 1.2432260526650554, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7449392080307007, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8695050477981567, + "num_tokens": 372895219.0, + "step": 9773 + }, + { + "epoch": 1.243353262943646, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.783558964729309, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8755935430526733, + "num_tokens": 372934491.0, + "step": 9774 + }, + { + "epoch": 1.2434804732222364, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8108599185943604, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.854686975479126, + "num_tokens": 372972892.0, + "step": 9775 + }, + { + "epoch": 1.243607683500827, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.946364164352417, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8664671778678894, + "num_tokens": 373011564.0, + "step": 9776 + }, + { + "epoch": 1.2437348937794175, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8521775007247925, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8654817342758179, + "num_tokens": 373049444.0, + "step": 9777 + }, + { + "epoch": 1.243862104058008, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8909502029418945, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8630554676055908, + "num_tokens": 373085466.0, + "step": 9778 + }, + { + "epoch": 1.2439893143365983, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9548555612564087, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8402680158615112, + "num_tokens": 373124712.0, + "step": 9779 + }, + { + "epoch": 1.2441165246151888, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8126288652420044, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8680148124694824, + "num_tokens": 373160008.0, + "step": 9780 + }, + { + "epoch": 1.2442437348937794, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.072834014892578, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8632733821868896, + "num_tokens": 373192110.0, + "step": 9781 + }, + { + "epoch": 1.24437094517237, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8436309099197388, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8650416135787964, + "num_tokens": 373234432.0, + "step": 9782 + }, + { + "epoch": 1.2444981554509604, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0080642700195312, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8534324169158936, + "num_tokens": 373270725.0, + "step": 9783 + }, + { + "epoch": 1.244625365729551, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9720293283462524, + "learning_rate": 1e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8407078385353088, + "num_tokens": 373307531.0, + "step": 9784 + }, + { + "epoch": 1.2447525760081415, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.786728024482727, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8667041063308716, + "num_tokens": 373351657.0, + "step": 9785 + }, + { + "epoch": 1.244879786286732, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0253031253814697, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8675537109375, + "num_tokens": 373388811.0, + "step": 9786 + }, + { + "epoch": 1.2450069965653225, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.857319712638855, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8734750747680664, + "num_tokens": 373423378.0, + "step": 9787 + }, + { + "epoch": 1.245134206843913, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8872087001800537, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8658192157745361, + "num_tokens": 373460982.0, + "step": 9788 + }, + { + "epoch": 1.2452614171225034, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8050589561462402, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8702819347381592, + "num_tokens": 373501474.0, + "step": 9789 + }, + { + "epoch": 1.245388627401094, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 3.48449969291687, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8667999505996704, + "num_tokens": 373537830.0, + "step": 9790 + }, + { + "epoch": 1.2455158376796844, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9740866422653198, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8555495738983154, + "num_tokens": 373573975.0, + "step": 9791 + }, + { + "epoch": 1.245643047958275, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9777464866638184, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8421875238418579, + "num_tokens": 373612157.0, + "step": 9792 + }, + { + "epoch": 1.2457702582368655, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9001702070236206, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8819180727005005, + "num_tokens": 373645065.0, + "step": 9793 + }, + { + "epoch": 1.245897468515456, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8092008829116821, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8582830429077148, + "num_tokens": 373687658.0, + "step": 9794 + }, + { + "epoch": 1.2460246787940465, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6658356189727783, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8657511472702026, + "num_tokens": 373732262.0, + "step": 9795 + }, + { + "epoch": 1.246151889072637, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8538081645965576, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8440022468566895, + "num_tokens": 373772410.0, + "step": 9796 + }, + { + "epoch": 1.2462790993512276, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.897531270980835, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8644087314605713, + "num_tokens": 373808773.0, + "step": 9797 + }, + { + "epoch": 1.2464063096298181, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0174014568328857, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.859047532081604, + "num_tokens": 373848945.0, + "step": 9798 + }, + { + "epoch": 1.2465335199084087, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8112759590148926, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8584615588188171, + "num_tokens": 373892716.0, + "step": 9799 + }, + { + "epoch": 1.2466607301869992, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8606209754943848, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8796956539154053, + "num_tokens": 373929443.0, + "step": 9800 + }, + { + "epoch": 1.2467879404655897, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9319391250610352, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.864298403263092, + "num_tokens": 373968882.0, + "step": 9801 + }, + { + "epoch": 1.2469151507441802, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7618155479431152, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8775169253349304, + "num_tokens": 374009265.0, + "step": 9802 + }, + { + "epoch": 1.2470423610227706, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.821437954902649, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8474925756454468, + "num_tokens": 374047152.0, + "step": 9803 + }, + { + "epoch": 1.247169571301361, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9645700454711914, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8596813678741455, + "num_tokens": 374083127.0, + "step": 9804 + }, + { + "epoch": 1.2472967815799516, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.795641303062439, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8720740079879761, + "num_tokens": 374127086.0, + "step": 9805 + }, + { + "epoch": 1.2474239918585421, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9567582607269287, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8571405410766602, + "num_tokens": 374163879.0, + "step": 9806 + }, + { + "epoch": 1.2475512021371327, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8395837545394897, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8690110445022583, + "num_tokens": 374206639.0, + "step": 9807 + }, + { + "epoch": 1.2476784124157232, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.036773204803467, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8602527976036072, + "num_tokens": 374244106.0, + "step": 9808 + }, + { + "epoch": 1.2478056226943137, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.793119192123413, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8604366779327393, + "num_tokens": 374285776.0, + "step": 9809 + }, + { + "epoch": 1.2479328329729042, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.840617299079895, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8722479343414307, + "num_tokens": 374323036.0, + "step": 9810 + }, + { + "epoch": 1.2480600432514948, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 5.185566425323486, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8701540231704712, + "num_tokens": 374360204.0, + "step": 9811 + }, + { + "epoch": 1.2481872535300853, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1253936290740967, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.873665452003479, + "num_tokens": 374394961.0, + "step": 9812 + }, + { + "epoch": 1.2483144638086758, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9247037172317505, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8656277656555176, + "num_tokens": 374436142.0, + "step": 9813 + }, + { + "epoch": 1.2484416740872661, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.932835340499878, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8583835363388062, + "num_tokens": 374474263.0, + "step": 9814 + }, + { + "epoch": 1.2485688843658567, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0192575454711914, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8580906391143799, + "num_tokens": 374510250.0, + "step": 9815 + }, + { + "epoch": 1.2486960946444472, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.889662504196167, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8835585117340088, + "num_tokens": 374549579.0, + "step": 9816 + }, + { + "epoch": 1.2488233049230377, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8871897459030151, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8617061972618103, + "num_tokens": 374588631.0, + "step": 9817 + }, + { + "epoch": 1.2489505152016283, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9009767770767212, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8586658239364624, + "num_tokens": 374627092.0, + "step": 9818 + }, + { + "epoch": 1.2490777254802188, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0503740310668945, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8689396381378174, + "num_tokens": 374657797.0, + "step": 9819 + }, + { + "epoch": 1.2492049357588093, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.854461431503296, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8671315312385559, + "num_tokens": 374694011.0, + "step": 9820 + }, + { + "epoch": 1.2493321460373998, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9398505687713623, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8656874895095825, + "num_tokens": 374735127.0, + "step": 9821 + }, + { + "epoch": 1.2494593563159904, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9251900911331177, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8603744506835938, + "num_tokens": 374774170.0, + "step": 9822 + }, + { + "epoch": 1.249586566594581, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0552785396575928, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8650590181350708, + "num_tokens": 374809162.0, + "step": 9823 + }, + { + "epoch": 1.2497137768731714, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8881516456604004, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8553942441940308, + "num_tokens": 374847974.0, + "step": 9824 + }, + { + "epoch": 1.249840987151762, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.687735915184021, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.878348708152771, + "num_tokens": 374894911.0, + "step": 9825 + }, + { + "epoch": 1.2499681974303525, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.858269214630127, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8581722974777222, + "num_tokens": 374933534.0, + "step": 9826 + }, + { + "epoch": 1.250095407708943, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.099306106567383, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.850365936756134, + "num_tokens": 374968183.0, + "step": 9827 + }, + { + "epoch": 1.2502226179875333, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0858242511749268, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8570388555526733, + "num_tokens": 375005973.0, + "step": 9828 + }, + { + "epoch": 1.2503498282661238, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8866878747940063, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8568205237388611, + "num_tokens": 375043446.0, + "step": 9829 + }, + { + "epoch": 1.2504770385447144, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.002568244934082, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8674397468566895, + "num_tokens": 375081518.0, + "step": 9830 + }, + { + "epoch": 1.250604248823305, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9206801652908325, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8562737107276917, + "num_tokens": 375117767.0, + "step": 9831 + }, + { + "epoch": 1.2507314591018954, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.1126129627227783, + "learning_rate": 1e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8391373753547668, + "num_tokens": 375152061.0, + "step": 9832 + }, + { + "epoch": 1.250858669380486, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.878007173538208, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8563933372497559, + "num_tokens": 375195677.0, + "step": 9833 + }, + { + "epoch": 1.2509858796590765, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0488529205322266, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8564124703407288, + "num_tokens": 375230319.0, + "step": 9834 + }, + { + "epoch": 1.251113089937667, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9386544227600098, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8679972887039185, + "num_tokens": 375268970.0, + "step": 9835 + }, + { + "epoch": 1.2512403002162575, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9301224946975708, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8701123595237732, + "num_tokens": 375303661.0, + "step": 9836 + }, + { + "epoch": 1.2513675104948478, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7992160320281982, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8706843852996826, + "num_tokens": 375343278.0, + "step": 9837 + }, + { + "epoch": 1.2514947207734384, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8485195636749268, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8696510791778564, + "num_tokens": 375378574.0, + "step": 9838 + }, + { + "epoch": 1.251621931052029, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.902693271636963, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8576331734657288, + "num_tokens": 375412781.0, + "step": 9839 + }, + { + "epoch": 1.2517491413306194, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.980865240097046, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8754868507385254, + "num_tokens": 375450431.0, + "step": 9840 + }, + { + "epoch": 1.25187635160921, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9873496294021606, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8570075035095215, + "num_tokens": 375490692.0, + "step": 9841 + }, + { + "epoch": 1.2520035618878005, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.939267635345459, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8682640790939331, + "num_tokens": 375535278.0, + "step": 9842 + }, + { + "epoch": 1.252130772166391, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8493926525115967, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8563395738601685, + "num_tokens": 375575859.0, + "step": 9843 + }, + { + "epoch": 1.2522579824449815, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9528840780258179, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8556368350982666, + "num_tokens": 375611084.0, + "step": 9844 + }, + { + "epoch": 1.252385192723572, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8810508251190186, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8723785877227783, + "num_tokens": 375644926.0, + "step": 9845 + }, + { + "epoch": 1.2525124030021626, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0795912742614746, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8718199133872986, + "num_tokens": 375686970.0, + "step": 9846 + }, + { + "epoch": 1.2526396132807531, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8651374578475952, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8746530413627625, + "num_tokens": 375727372.0, + "step": 9847 + }, + { + "epoch": 1.2527668235593437, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9428329467773438, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8519538640975952, + "num_tokens": 375763568.0, + "step": 9848 + }, + { + "epoch": 1.2528940338379342, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0002753734588623, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.846917986869812, + "num_tokens": 375797954.0, + "step": 9849 + }, + { + "epoch": 1.2530212441165247, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.1096787452697754, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8611029982566833, + "num_tokens": 375836362.0, + "step": 9850 + }, + { + "epoch": 1.2531484543951152, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9621529579162598, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8584364652633667, + "num_tokens": 375879403.0, + "step": 9851 + }, + { + "epoch": 1.2532756646737058, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.056952714920044, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8663722276687622, + "num_tokens": 375916728.0, + "step": 9852 + }, + { + "epoch": 1.253402874952296, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9140335321426392, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8578870892524719, + "num_tokens": 375951743.0, + "step": 9853 + }, + { + "epoch": 1.2535300852308866, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9947987794876099, + "learning_rate": 1e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8374764323234558, + "num_tokens": 375988872.0, + "step": 9854 + }, + { + "epoch": 1.2536572955094771, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.026594638824463, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8526527285575867, + "num_tokens": 376030388.0, + "step": 9855 + }, + { + "epoch": 1.2537845057880677, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8552623987197876, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8727453947067261, + "num_tokens": 376062975.0, + "step": 9856 + }, + { + "epoch": 1.2539117160666582, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8933329582214355, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8551591634750366, + "num_tokens": 376101873.0, + "step": 9857 + }, + { + "epoch": 1.2540389263452487, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0968189239501953, + "learning_rate": 1e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.84184730052948, + "num_tokens": 376138774.0, + "step": 9858 + }, + { + "epoch": 1.2541661366238392, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0646328926086426, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8551272749900818, + "num_tokens": 376172357.0, + "step": 9859 + }, + { + "epoch": 1.2542933469024298, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9876490831375122, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8530389666557312, + "num_tokens": 376212977.0, + "step": 9860 + }, + { + "epoch": 1.2544205571810203, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.90050208568573, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8531256318092346, + "num_tokens": 376256989.0, + "step": 9861 + }, + { + "epoch": 1.2545477674596106, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8796707391738892, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8669565916061401, + "num_tokens": 376300130.0, + "step": 9862 + }, + { + "epoch": 1.2546749777382011, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8121029138565063, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8660516142845154, + "num_tokens": 376337475.0, + "step": 9863 + }, + { + "epoch": 1.2548021880167917, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.91885507106781, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8539301156997681, + "num_tokens": 376376485.0, + "step": 9864 + }, + { + "epoch": 1.2549293982953822, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8983757495880127, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8689960837364197, + "num_tokens": 376413393.0, + "step": 9865 + }, + { + "epoch": 1.2550566085739727, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.889499306678772, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8530222177505493, + "num_tokens": 376448192.0, + "step": 9866 + }, + { + "epoch": 1.2551838188525632, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.874870777130127, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8650332689285278, + "num_tokens": 376489382.0, + "step": 9867 + }, + { + "epoch": 1.2553110291311538, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8884230852127075, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8669571876525879, + "num_tokens": 376527221.0, + "step": 9868 + }, + { + "epoch": 1.2554382394097443, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9527286291122437, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.858939528465271, + "num_tokens": 376567045.0, + "step": 9869 + }, + { + "epoch": 1.2555654496883348, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8288830518722534, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8705773949623108, + "num_tokens": 376607039.0, + "step": 9870 + }, + { + "epoch": 1.2556926599669254, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.188237428665161, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8643412590026855, + "num_tokens": 376643220.0, + "step": 9871 + }, + { + "epoch": 1.255819870245516, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.885492205619812, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.881866455078125, + "num_tokens": 376681417.0, + "step": 9872 + }, + { + "epoch": 1.2559470805241064, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7858554124832153, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8821122646331787, + "num_tokens": 376722836.0, + "step": 9873 + }, + { + "epoch": 1.256074290802697, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8156120777130127, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8554278016090393, + "num_tokens": 376763194.0, + "step": 9874 + }, + { + "epoch": 1.2562015010812875, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9191856384277344, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8587120175361633, + "num_tokens": 376803701.0, + "step": 9875 + }, + { + "epoch": 1.256328711359878, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.96239173412323, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8715770244598389, + "num_tokens": 376835296.0, + "step": 9876 + }, + { + "epoch": 1.2564559216384683, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9101693630218506, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8656009435653687, + "num_tokens": 376876317.0, + "step": 9877 + }, + { + "epoch": 1.2565831319170588, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7687841653823853, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8616409301757812, + "num_tokens": 376916403.0, + "step": 9878 + }, + { + "epoch": 1.2567103421956494, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.928783655166626, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8733383417129517, + "num_tokens": 376956079.0, + "step": 9879 + }, + { + "epoch": 1.25683755247424, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9428812265396118, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8541293144226074, + "num_tokens": 376991618.0, + "step": 9880 + }, + { + "epoch": 1.2569647627528304, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7952549457550049, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8679533004760742, + "num_tokens": 377033964.0, + "step": 9881 + }, + { + "epoch": 1.257091973031421, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9058398008346558, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8714115023612976, + "num_tokens": 377070091.0, + "step": 9882 + }, + { + "epoch": 1.2572191833100115, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8887524604797363, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8719683885574341, + "num_tokens": 377111185.0, + "step": 9883 + }, + { + "epoch": 1.257346393588602, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8318088054656982, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8675731420516968, + "num_tokens": 377147637.0, + "step": 9884 + }, + { + "epoch": 1.2574736038671925, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8627943992614746, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8624119758605957, + "num_tokens": 377181918.0, + "step": 9885 + }, + { + "epoch": 1.2576008141457828, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.747828722000122, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8563898205757141, + "num_tokens": 377222900.0, + "step": 9886 + }, + { + "epoch": 1.2577280244243734, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.145470380783081, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8771020174026489, + "num_tokens": 377263107.0, + "step": 9887 + }, + { + "epoch": 1.257855234702964, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9254264831542969, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8781417012214661, + "num_tokens": 377301384.0, + "step": 9888 + }, + { + "epoch": 1.2579824449815544, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9482108354568481, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8562145233154297, + "num_tokens": 377339865.0, + "step": 9889 + }, + { + "epoch": 1.258109655260145, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8359318971633911, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8720027208328247, + "num_tokens": 377381582.0, + "step": 9890 + }, + { + "epoch": 1.2582368655387355, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8443933725357056, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8410952091217041, + "num_tokens": 377425548.0, + "step": 9891 + }, + { + "epoch": 1.258364075817326, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8530043363571167, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8622088432312012, + "num_tokens": 377462298.0, + "step": 9892 + }, + { + "epoch": 1.2584912860959165, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9674136638641357, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8565667867660522, + "num_tokens": 377503873.0, + "step": 9893 + }, + { + "epoch": 1.258618496374507, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.178032636642456, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8782404661178589, + "num_tokens": 377540497.0, + "step": 9894 + }, + { + "epoch": 1.2587457066530976, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9601161479949951, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8639516830444336, + "num_tokens": 377582242.0, + "step": 9895 + }, + { + "epoch": 1.2588729169316881, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8659335374832153, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8785471320152283, + "num_tokens": 377614972.0, + "step": 9896 + }, + { + "epoch": 1.2590001272102787, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.169201374053955, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8603991270065308, + "num_tokens": 377645392.0, + "step": 9897 + }, + { + "epoch": 1.2591273374888692, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.943954586982727, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8715676069259644, + "num_tokens": 377680317.0, + "step": 9898 + }, + { + "epoch": 1.2592545477674597, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1493654251098633, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8640673160552979, + "num_tokens": 377714879.0, + "step": 9899 + }, + { + "epoch": 1.2593817580460502, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0395567417144775, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8499451279640198, + "num_tokens": 377751404.0, + "step": 9900 + }, + { + "epoch": 1.2595089683246408, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9541462659835815, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.867916464805603, + "num_tokens": 377788354.0, + "step": 9901 + }, + { + "epoch": 1.259636178603231, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.13395094871521, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8528172969818115, + "num_tokens": 377827259.0, + "step": 9902 + }, + { + "epoch": 1.2597633888818216, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0264952182769775, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8564318418502808, + "num_tokens": 377872477.0, + "step": 9903 + }, + { + "epoch": 1.2598905991604121, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0739827156066895, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8548307418823242, + "num_tokens": 377911159.0, + "step": 9904 + }, + { + "epoch": 1.2600178094390027, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9147437810897827, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8660134077072144, + "num_tokens": 377951919.0, + "step": 9905 + }, + { + "epoch": 1.2601450197175932, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9059584140777588, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.845612645149231, + "num_tokens": 377997567.0, + "step": 9906 + }, + { + "epoch": 1.2602722299961837, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0266008377075195, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8618413805961609, + "num_tokens": 378033811.0, + "step": 9907 + }, + { + "epoch": 1.2603994402747742, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1866955757141113, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8505038022994995, + "num_tokens": 378070136.0, + "step": 9908 + }, + { + "epoch": 1.2605266505533648, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3685810565948486, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8609879016876221, + "num_tokens": 378106902.0, + "step": 9909 + }, + { + "epoch": 1.2606538608319553, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1408851146698, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8659747838973999, + "num_tokens": 378144638.0, + "step": 9910 + }, + { + "epoch": 1.2607810711105456, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9490913152694702, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8731681108474731, + "num_tokens": 378179059.0, + "step": 9911 + }, + { + "epoch": 1.2609082813891361, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.33372163772583, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8496347069740295, + "num_tokens": 378210346.0, + "step": 9912 + }, + { + "epoch": 1.2610354916677267, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.04606556892395, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8689872026443481, + "num_tokens": 378242687.0, + "step": 9913 + }, + { + "epoch": 1.2611627019463172, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.030086040496826, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8695917129516602, + "num_tokens": 378279421.0, + "step": 9914 + }, + { + "epoch": 1.2612899122249077, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8947527408599854, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8592886328697205, + "num_tokens": 378318133.0, + "step": 9915 + }, + { + "epoch": 1.2614171225034982, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9428492784500122, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8458939790725708, + "num_tokens": 378362987.0, + "step": 9916 + }, + { + "epoch": 1.2615443327820888, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.996449589729309, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8809062242507935, + "num_tokens": 378396197.0, + "step": 9917 + }, + { + "epoch": 1.2616715430606793, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9135711193084717, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.871580958366394, + "num_tokens": 378433691.0, + "step": 9918 + }, + { + "epoch": 1.2617987533392698, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0144155025482178, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8660438060760498, + "num_tokens": 378470277.0, + "step": 9919 + }, + { + "epoch": 1.2619259636178604, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9047439098358154, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8576014041900635, + "num_tokens": 378510500.0, + "step": 9920 + }, + { + "epoch": 1.2620531738964509, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.15408992767334, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8602727651596069, + "num_tokens": 378552893.0, + "step": 9921 + }, + { + "epoch": 1.2621803841750414, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7867792844772339, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8703932762145996, + "num_tokens": 378590917.0, + "step": 9922 + }, + { + "epoch": 1.262307594453632, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.805046796798706, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8722431063652039, + "num_tokens": 378637764.0, + "step": 9923 + }, + { + "epoch": 1.2624348047322225, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8894919157028198, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8597537279129028, + "num_tokens": 378675497.0, + "step": 9924 + }, + { + "epoch": 1.262562015010813, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0982563495635986, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8547021150588989, + "num_tokens": 378710470.0, + "step": 9925 + }, + { + "epoch": 1.2626892252894033, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7663750648498535, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8588895797729492, + "num_tokens": 378758116.0, + "step": 9926 + }, + { + "epoch": 1.2628164355679938, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9823368787765503, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8598769903182983, + "num_tokens": 378798166.0, + "step": 9927 + }, + { + "epoch": 1.2629436458465844, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7203099727630615, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.878393292427063, + "num_tokens": 378839899.0, + "step": 9928 + }, + { + "epoch": 1.263070856125175, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8796672821044922, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8633975982666016, + "num_tokens": 378886022.0, + "step": 9929 + }, + { + "epoch": 1.2631980664037654, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.1757266521453857, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8521684408187866, + "num_tokens": 378925097.0, + "step": 9930 + }, + { + "epoch": 1.263325276682356, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7898856401443481, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8752545118331909, + "num_tokens": 378965242.0, + "step": 9931 + }, + { + "epoch": 1.2634524869609465, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0891060829162598, + "learning_rate": 1e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.8428924083709717, + "num_tokens": 379005932.0, + "step": 9932 + }, + { + "epoch": 1.263579697239537, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9189472198486328, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8607100248336792, + "num_tokens": 379045817.0, + "step": 9933 + }, + { + "epoch": 1.2637069075181275, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8807334899902344, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8564460277557373, + "num_tokens": 379086481.0, + "step": 9934 + }, + { + "epoch": 1.2638341177967178, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.939530611038208, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8468097448348999, + "num_tokens": 379129782.0, + "step": 9935 + }, + { + "epoch": 1.2639613280753084, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0651137828826904, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8606795072555542, + "num_tokens": 379171525.0, + "step": 9936 + }, + { + "epoch": 1.264088538353899, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.859269618988037, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8562257289886475, + "num_tokens": 379213887.0, + "step": 9937 + }, + { + "epoch": 1.2642157486324894, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9134718179702759, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.863142192363739, + "num_tokens": 379252634.0, + "step": 9938 + }, + { + "epoch": 1.26434295891108, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0142898559570312, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8611506223678589, + "num_tokens": 379287402.0, + "step": 9939 + }, + { + "epoch": 1.2644701691896705, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9260276556015015, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8688437938690186, + "num_tokens": 379325592.0, + "step": 9940 + }, + { + "epoch": 1.264597379468261, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9532361030578613, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8509130477905273, + "num_tokens": 379359010.0, + "step": 9941 + }, + { + "epoch": 1.2647245897468515, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9167063236236572, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8557944297790527, + "num_tokens": 379399783.0, + "step": 9942 + }, + { + "epoch": 1.264851800025442, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 16.606548309326172, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8656061887741089, + "num_tokens": 379442207.0, + "step": 9943 + }, + { + "epoch": 1.2649790103040326, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0171148777008057, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8620141744613647, + "num_tokens": 379479561.0, + "step": 9944 + }, + { + "epoch": 1.2651062205826231, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2025790214538574, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8679477572441101, + "num_tokens": 379516413.0, + "step": 9945 + }, + { + "epoch": 1.2652334308612136, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9698337316513062, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8527162075042725, + "num_tokens": 379557993.0, + "step": 9946 + }, + { + "epoch": 1.2653606411398042, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 3.0021095275878906, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8450341820716858, + "num_tokens": 379594028.0, + "step": 9947 + }, + { + "epoch": 1.2654878514183947, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.97133207321167, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.856998860836029, + "num_tokens": 379631235.0, + "step": 9948 + }, + { + "epoch": 1.2656150616969852, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8370834589004517, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8508431315422058, + "num_tokens": 379668714.0, + "step": 9949 + }, + { + "epoch": 1.2657422719755758, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7925281524658203, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8676126003265381, + "num_tokens": 379710233.0, + "step": 9950 + }, + { + "epoch": 1.265869482254166, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.078261613845825, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8584498167037964, + "num_tokens": 379746208.0, + "step": 9951 + }, + { + "epoch": 1.2659966925327566, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7444087266921997, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8713960647583008, + "num_tokens": 379796420.0, + "step": 9952 + }, + { + "epoch": 1.2661239028113471, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9707986116409302, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8643708229064941, + "num_tokens": 379830027.0, + "step": 9953 + }, + { + "epoch": 1.2662511130899377, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.870239019393921, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8620332479476929, + "num_tokens": 379871751.0, + "step": 9954 + }, + { + "epoch": 1.2663783233685282, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0371553897857666, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8564506769180298, + "num_tokens": 379908255.0, + "step": 9955 + }, + { + "epoch": 1.2665055336471187, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9563161134719849, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8647980093955994, + "num_tokens": 379944502.0, + "step": 9956 + }, + { + "epoch": 1.2666327439257092, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.913050651550293, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8584272861480713, + "num_tokens": 379981929.0, + "step": 9957 + }, + { + "epoch": 1.2667599542042998, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9047127962112427, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8586078882217407, + "num_tokens": 380018876.0, + "step": 9958 + }, + { + "epoch": 1.2668871644828903, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9036747217178345, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8578698635101318, + "num_tokens": 380054153.0, + "step": 9959 + }, + { + "epoch": 1.2670143747614806, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.180576801300049, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8541662693023682, + "num_tokens": 380086550.0, + "step": 9960 + }, + { + "epoch": 1.2671415850400711, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.146599531173706, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8577630519866943, + "num_tokens": 380120308.0, + "step": 9961 + }, + { + "epoch": 1.2672687953186617, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.925384283065796, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8461845517158508, + "num_tokens": 380160204.0, + "step": 9962 + }, + { + "epoch": 1.2673960055972522, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.912421464920044, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.865320086479187, + "num_tokens": 380196687.0, + "step": 9963 + }, + { + "epoch": 1.2675232158758427, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.713858723640442, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8779747486114502, + "num_tokens": 380237451.0, + "step": 9964 + }, + { + "epoch": 1.2676504261544332, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8326447010040283, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.858711838722229, + "num_tokens": 380276631.0, + "step": 9965 + }, + { + "epoch": 1.2677776364330238, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8416990041732788, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8675153255462646, + "num_tokens": 380319432.0, + "step": 9966 + }, + { + "epoch": 1.2679048467116143, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8580392599105835, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8698563575744629, + "num_tokens": 380356666.0, + "step": 9967 + }, + { + "epoch": 1.2680320569902048, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.975584626197815, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8708842992782593, + "num_tokens": 380390347.0, + "step": 9968 + }, + { + "epoch": 1.2681592672687954, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9953651428222656, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8662471175193787, + "num_tokens": 380424083.0, + "step": 9969 + }, + { + "epoch": 1.2682864775473859, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8465213775634766, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8578859567642212, + "num_tokens": 380471179.0, + "step": 9970 + }, + { + "epoch": 1.2684136878259764, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9175833463668823, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8641789555549622, + "num_tokens": 380510291.0, + "step": 9971 + }, + { + "epoch": 1.268540898104567, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.6871955394744873, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8672890663146973, + "num_tokens": 380556297.0, + "step": 9972 + }, + { + "epoch": 1.2686681083831575, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.019793748855591, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8722447752952576, + "num_tokens": 380592113.0, + "step": 9973 + }, + { + "epoch": 1.268795318661748, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1284337043762207, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8750796318054199, + "num_tokens": 380623942.0, + "step": 9974 + }, + { + "epoch": 1.2689225289403383, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8755288124084473, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8555004000663757, + "num_tokens": 380662300.0, + "step": 9975 + }, + { + "epoch": 1.2690497392189288, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7809466123580933, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8646634221076965, + "num_tokens": 380701452.0, + "step": 9976 + }, + { + "epoch": 1.2691769494975194, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7997570037841797, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8736669421195984, + "num_tokens": 380741248.0, + "step": 9977 + }, + { + "epoch": 1.2693041597761099, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9247974157333374, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8551810383796692, + "num_tokens": 380781455.0, + "step": 9978 + }, + { + "epoch": 1.2694313700547004, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0254201889038086, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.853422999382019, + "num_tokens": 380818139.0, + "step": 9979 + }, + { + "epoch": 1.269558580333291, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8440123796463013, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8678383827209473, + "num_tokens": 380860530.0, + "step": 9980 + }, + { + "epoch": 1.2696857906118815, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8729009628295898, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8751125335693359, + "num_tokens": 380900825.0, + "step": 9981 + }, + { + "epoch": 1.269813000890472, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8641462326049805, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8604134917259216, + "num_tokens": 380939270.0, + "step": 9982 + }, + { + "epoch": 1.2699402111690625, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8821871280670166, + "learning_rate": 1e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8400642275810242, + "num_tokens": 380981708.0, + "step": 9983 + }, + { + "epoch": 1.2700674214476528, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9767364263534546, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8688688278198242, + "num_tokens": 381015186.0, + "step": 9984 + }, + { + "epoch": 1.2701946317262434, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8052611351013184, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8728123307228088, + "num_tokens": 381055503.0, + "step": 9985 + }, + { + "epoch": 1.270321842004834, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8784898519515991, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8761350512504578, + "num_tokens": 381091935.0, + "step": 9986 + }, + { + "epoch": 1.2704490522834244, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.171841859817505, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8672984838485718, + "num_tokens": 381127951.0, + "step": 9987 + }, + { + "epoch": 1.270576262562015, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.955109715461731, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8581751585006714, + "num_tokens": 381161918.0, + "step": 9988 + }, + { + "epoch": 1.2707034728406055, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9634294509887695, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8507606983184814, + "num_tokens": 381198747.0, + "step": 9989 + }, + { + "epoch": 1.270830683119196, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8236767053604126, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8640331625938416, + "num_tokens": 381242714.0, + "step": 9990 + }, + { + "epoch": 1.2709578933977865, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1159958839416504, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.872209906578064, + "num_tokens": 381274667.0, + "step": 9991 + }, + { + "epoch": 1.271085103676377, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8823360204696655, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8731070756912231, + "num_tokens": 381318110.0, + "step": 9992 + }, + { + "epoch": 1.2712123139549676, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.8076326847076416, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8639092445373535, + "num_tokens": 381356694.0, + "step": 9993 + }, + { + "epoch": 1.2713395242335581, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9918662309646606, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8514655828475952, + "num_tokens": 381395218.0, + "step": 9994 + }, + { + "epoch": 1.2714667345121486, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9147231578826904, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8773853778839111, + "num_tokens": 381429195.0, + "step": 9995 + }, + { + "epoch": 1.2715939447907392, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9415236711502075, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8566122055053711, + "num_tokens": 381468276.0, + "step": 9996 + }, + { + "epoch": 1.2717211550693297, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.911548137664795, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8706299662590027, + "num_tokens": 381503400.0, + "step": 9997 + }, + { + "epoch": 1.2718483653479202, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.91032075881958, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8681938648223877, + "num_tokens": 381542126.0, + "step": 9998 + }, + { + "epoch": 1.2719755756265108, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7266221046447754, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8717399835586548, + "num_tokens": 381579169.0, + "step": 9999 + }, + { + "epoch": 1.272102785905101, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2553253173828125, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8489377498626709, + "num_tokens": 381615501.0, + "step": 10000 + }, + { + "epoch": 1.2722299961836916, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1312525272369385, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8569245338439941, + "num_tokens": 381649737.0, + "step": 10001 + }, + { + "epoch": 1.2723572064622821, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8526146411895752, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8653364181518555, + "num_tokens": 381693776.0, + "step": 10002 + }, + { + "epoch": 1.2724844167408726, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9850043058395386, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8723968267440796, + "num_tokens": 381733324.0, + "step": 10003 + }, + { + "epoch": 1.2726116270194632, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.13911509513855, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8581146597862244, + "num_tokens": 381765893.0, + "step": 10004 + }, + { + "epoch": 1.2727388372980537, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8762532472610474, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8597962260246277, + "num_tokens": 381802782.0, + "step": 10005 + }, + { + "epoch": 1.2728660475766442, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9198596477508545, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8660786151885986, + "num_tokens": 381843448.0, + "step": 10006 + }, + { + "epoch": 1.2729932578552348, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0097579956054688, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8519811034202576, + "num_tokens": 381878850.0, + "step": 10007 + }, + { + "epoch": 1.2731204681338253, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.037517547607422, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8417450189590454, + "num_tokens": 381915508.0, + "step": 10008 + }, + { + "epoch": 1.2732476784124156, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0344014167785645, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8530006408691406, + "num_tokens": 381952699.0, + "step": 10009 + }, + { + "epoch": 1.2733748886910061, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8621798753738403, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8566233515739441, + "num_tokens": 381989947.0, + "step": 10010 + }, + { + "epoch": 1.2735020989695967, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7742725610733032, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8623650074005127, + "num_tokens": 382027701.0, + "step": 10011 + }, + { + "epoch": 1.2736293092481872, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.853243350982666, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8616867065429688, + "num_tokens": 382066415.0, + "step": 10012 + }, + { + "epoch": 1.2737565195267777, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8092586994171143, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.85700523853302, + "num_tokens": 382108402.0, + "step": 10013 + }, + { + "epoch": 1.2738837298053682, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9220421314239502, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8648180961608887, + "num_tokens": 382146865.0, + "step": 10014 + }, + { + "epoch": 1.2740109400839588, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9651819467544556, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8593286871910095, + "num_tokens": 382186746.0, + "step": 10015 + }, + { + "epoch": 1.2741381503625493, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.243755340576172, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8764537572860718, + "num_tokens": 382220693.0, + "step": 10016 + }, + { + "epoch": 1.2742653606411398, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9295545816421509, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8564333319664001, + "num_tokens": 382259549.0, + "step": 10017 + }, + { + "epoch": 1.2743925709197303, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.067042350769043, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8723474740982056, + "num_tokens": 382299164.0, + "step": 10018 + }, + { + "epoch": 1.2745197811983209, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.889866590499878, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8765929937362671, + "num_tokens": 382333596.0, + "step": 10019 + }, + { + "epoch": 1.2746469914769114, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8239352703094482, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8643603920936584, + "num_tokens": 382372321.0, + "step": 10020 + }, + { + "epoch": 1.274774201755502, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9406288862228394, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8662539124488831, + "num_tokens": 382411125.0, + "step": 10021 + }, + { + "epoch": 1.2749014120340925, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.023768424987793, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8512911796569824, + "num_tokens": 382450935.0, + "step": 10022 + }, + { + "epoch": 1.275028622312683, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8778276443481445, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8607174158096313, + "num_tokens": 382488232.0, + "step": 10023 + }, + { + "epoch": 1.2751558325912733, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9313735961914062, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8597514033317566, + "num_tokens": 382522280.0, + "step": 10024 + }, + { + "epoch": 1.2752830428698638, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 80.52088928222656, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8725253939628601, + "num_tokens": 382562788.0, + "step": 10025 + }, + { + "epoch": 1.2754102531484544, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.055762767791748, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8549377918243408, + "num_tokens": 382598534.0, + "step": 10026 + }, + { + "epoch": 1.2755374634270449, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.161120653152466, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8569455146789551, + "num_tokens": 382641803.0, + "step": 10027 + }, + { + "epoch": 1.2756646737056354, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2056562900543213, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8722847700119019, + "num_tokens": 382677871.0, + "step": 10028 + }, + { + "epoch": 1.275791883984226, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8233052492141724, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8715994358062744, + "num_tokens": 382714989.0, + "step": 10029 + }, + { + "epoch": 1.2759190942628165, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.055706024169922, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8654223680496216, + "num_tokens": 382751919.0, + "step": 10030 + }, + { + "epoch": 1.276046304541407, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.728413701057434, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8738527894020081, + "num_tokens": 382793550.0, + "step": 10031 + }, + { + "epoch": 1.2761735148199975, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8227753639221191, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8720290660858154, + "num_tokens": 382833142.0, + "step": 10032 + }, + { + "epoch": 1.2763007250985878, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.850313663482666, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8712420463562012, + "num_tokens": 382869934.0, + "step": 10033 + }, + { + "epoch": 1.2764279353771784, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0357439517974854, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8483548760414124, + "num_tokens": 382909533.0, + "step": 10034 + }, + { + "epoch": 1.2765551456557689, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8122590780258179, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8694506883621216, + "num_tokens": 382947462.0, + "step": 10035 + }, + { + "epoch": 1.2766823559343594, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8700616359710693, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8642427325248718, + "num_tokens": 382989973.0, + "step": 10036 + }, + { + "epoch": 1.27680956621295, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8152776956558228, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8621717691421509, + "num_tokens": 383033052.0, + "step": 10037 + }, + { + "epoch": 1.2769367764915405, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.845827341079712, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8617935180664062, + "num_tokens": 383075981.0, + "step": 10038 + }, + { + "epoch": 1.277063986770131, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.921479344367981, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8794265985488892, + "num_tokens": 383111984.0, + "step": 10039 + }, + { + "epoch": 1.2771911970487215, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8037110567092896, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8648359775543213, + "num_tokens": 383155323.0, + "step": 10040 + }, + { + "epoch": 1.277318407327312, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0577328205108643, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8612703084945679, + "num_tokens": 383190069.0, + "step": 10041 + }, + { + "epoch": 1.2774456176059026, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.180342197418213, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8543062210083008, + "num_tokens": 383224999.0, + "step": 10042 + }, + { + "epoch": 1.2775728278844931, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9193059206008911, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.856883704662323, + "num_tokens": 383267764.0, + "step": 10043 + }, + { + "epoch": 1.2777000381630836, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9741761684417725, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8724915385246277, + "num_tokens": 383306148.0, + "step": 10044 + }, + { + "epoch": 1.2778272484416742, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0191845893859863, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.867242157459259, + "num_tokens": 383343408.0, + "step": 10045 + }, + { + "epoch": 1.2779544587202647, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9297629594802856, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8860926628112793, + "num_tokens": 383383626.0, + "step": 10046 + }, + { + "epoch": 1.2780816689988552, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9315623044967651, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8474342823028564, + "num_tokens": 383423218.0, + "step": 10047 + }, + { + "epoch": 1.2782088792774458, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.827401041984558, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8700025081634521, + "num_tokens": 383461126.0, + "step": 10048 + }, + { + "epoch": 1.278336089556036, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8204243183135986, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8555455803871155, + "num_tokens": 383501808.0, + "step": 10049 + }, + { + "epoch": 1.2784632998346266, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7866555452346802, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8833974599838257, + "num_tokens": 383542395.0, + "step": 10050 + }, + { + "epoch": 1.2785905101132171, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9646509885787964, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8608207702636719, + "num_tokens": 383579968.0, + "step": 10051 + }, + { + "epoch": 1.2787177203918076, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.948723316192627, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8628872036933899, + "num_tokens": 383617861.0, + "step": 10052 + }, + { + "epoch": 1.2788449306703982, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9625223875045776, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8579505681991577, + "num_tokens": 383652534.0, + "step": 10053 + }, + { + "epoch": 1.2789721409489887, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9818342924118042, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8593262434005737, + "num_tokens": 383693120.0, + "step": 10054 + }, + { + "epoch": 1.2790993512275792, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.981929898262024, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8548555374145508, + "num_tokens": 383734958.0, + "step": 10055 + }, + { + "epoch": 1.2792265615061698, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9868624210357666, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8546243906021118, + "num_tokens": 383768873.0, + "step": 10056 + }, + { + "epoch": 1.2793537717847603, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.849166750907898, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8651880621910095, + "num_tokens": 383804063.0, + "step": 10057 + }, + { + "epoch": 1.2794809820633506, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8676011562347412, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8765466809272766, + "num_tokens": 383843098.0, + "step": 10058 + }, + { + "epoch": 1.2796081923419411, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0740249156951904, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8718550205230713, + "num_tokens": 383879443.0, + "step": 10059 + }, + { + "epoch": 1.2797354026205316, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8810694217681885, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8615517616271973, + "num_tokens": 383918071.0, + "step": 10060 + }, + { + "epoch": 1.2798626128991222, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.217597723007202, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8564620018005371, + "num_tokens": 383953052.0, + "step": 10061 + }, + { + "epoch": 1.2799898231777127, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0059280395507812, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.861119270324707, + "num_tokens": 383992835.0, + "step": 10062 + }, + { + "epoch": 1.2801170334563032, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.055232524871826, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8509042263031006, + "num_tokens": 384031234.0, + "step": 10063 + }, + { + "epoch": 1.2802442437348938, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.015287399291992, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8540136814117432, + "num_tokens": 384063611.0, + "step": 10064 + }, + { + "epoch": 1.2803714540134843, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.694108247756958, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8706934452056885, + "num_tokens": 384106757.0, + "step": 10065 + }, + { + "epoch": 1.2804986642920748, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7627185583114624, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8659831285476685, + "num_tokens": 384143094.0, + "step": 10066 + }, + { + "epoch": 1.2806258745706653, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.978482961654663, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8631768822669983, + "num_tokens": 384181498.0, + "step": 10067 + }, + { + "epoch": 1.2807530848492559, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.184033155441284, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8667192459106445, + "num_tokens": 384209784.0, + "step": 10068 + }, + { + "epoch": 1.2808802951278464, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.949616551399231, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8677606582641602, + "num_tokens": 384248000.0, + "step": 10069 + }, + { + "epoch": 1.281007505406437, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7942625284194946, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8546826839447021, + "num_tokens": 384290897.0, + "step": 10070 + }, + { + "epoch": 1.2811347156850275, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9634886980056763, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8552719354629517, + "num_tokens": 384329464.0, + "step": 10071 + }, + { + "epoch": 1.281261925963618, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8187915086746216, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.866626501083374, + "num_tokens": 384368915.0, + "step": 10072 + }, + { + "epoch": 1.2813891362422083, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9071426391601562, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8726921081542969, + "num_tokens": 384407445.0, + "step": 10073 + }, + { + "epoch": 1.2815163465207988, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9469701051712036, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8593621253967285, + "num_tokens": 384444384.0, + "step": 10074 + }, + { + "epoch": 1.2816435567993893, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9642717838287354, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.858762264251709, + "num_tokens": 384479670.0, + "step": 10075 + }, + { + "epoch": 1.2817707670779799, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7556171417236328, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8514947295188904, + "num_tokens": 384522436.0, + "step": 10076 + }, + { + "epoch": 1.2818979773565704, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9848120212554932, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8553205132484436, + "num_tokens": 384559754.0, + "step": 10077 + }, + { + "epoch": 1.282025187635161, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.971911907196045, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8591450452804565, + "num_tokens": 384595153.0, + "step": 10078 + }, + { + "epoch": 1.2821523979137515, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.940669059753418, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.862380862236023, + "num_tokens": 384628345.0, + "step": 10079 + }, + { + "epoch": 1.282279608192342, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8751899003982544, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8582552075386047, + "num_tokens": 384666725.0, + "step": 10080 + }, + { + "epoch": 1.2824068184709325, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.865585446357727, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8537638187408447, + "num_tokens": 384706217.0, + "step": 10081 + }, + { + "epoch": 1.2825340287495228, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0225818157196045, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8590495586395264, + "num_tokens": 384741835.0, + "step": 10082 + }, + { + "epoch": 1.2826612390281134, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9302350282669067, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8639117479324341, + "num_tokens": 384777273.0, + "step": 10083 + }, + { + "epoch": 1.2827884493067039, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7909538745880127, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8764007091522217, + "num_tokens": 384814950.0, + "step": 10084 + }, + { + "epoch": 1.2829156595852944, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.017272710800171, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8551494479179382, + "num_tokens": 384848088.0, + "step": 10085 + }, + { + "epoch": 1.283042869863885, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8892759084701538, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8541578054428101, + "num_tokens": 384887361.0, + "step": 10086 + }, + { + "epoch": 1.2831700801424755, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7671278715133667, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8694747686386108, + "num_tokens": 384930459.0, + "step": 10087 + }, + { + "epoch": 1.283297290421066, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.5753586292266846, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8567860722541809, + "num_tokens": 384969506.0, + "step": 10088 + }, + { + "epoch": 1.2834245006996565, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0400164127349854, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8682198524475098, + "num_tokens": 385004368.0, + "step": 10089 + }, + { + "epoch": 1.283551710978247, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7924072742462158, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8634454607963562, + "num_tokens": 385044553.0, + "step": 10090 + }, + { + "epoch": 1.2836789212568376, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8673772811889648, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8755264282226562, + "num_tokens": 385076873.0, + "step": 10091 + }, + { + "epoch": 1.283806131535428, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.778692364692688, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8659236431121826, + "num_tokens": 385113969.0, + "step": 10092 + }, + { + "epoch": 1.2839333418140186, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8405206203460693, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8732067346572876, + "num_tokens": 385149460.0, + "step": 10093 + }, + { + "epoch": 1.2840605520926092, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0699985027313232, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8449972867965698, + "num_tokens": 385187240.0, + "step": 10094 + }, + { + "epoch": 1.2841877623711997, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9537012577056885, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8554632067680359, + "num_tokens": 385225532.0, + "step": 10095 + }, + { + "epoch": 1.2843149726497902, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.837591290473938, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.865677535533905, + "num_tokens": 385267404.0, + "step": 10096 + }, + { + "epoch": 1.2844421829283807, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8100887537002563, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.850838303565979, + "num_tokens": 385314008.0, + "step": 10097 + }, + { + "epoch": 1.284569393206971, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1180028915405273, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8494052886962891, + "num_tokens": 385352204.0, + "step": 10098 + }, + { + "epoch": 1.2846966034855616, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7934348583221436, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8550981283187866, + "num_tokens": 385393307.0, + "step": 10099 + }, + { + "epoch": 1.2848238137641521, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.970182180404663, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8636438846588135, + "num_tokens": 385426124.0, + "step": 10100 + }, + { + "epoch": 1.2849510240427426, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8957878351211548, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8672734498977661, + "num_tokens": 385467471.0, + "step": 10101 + }, + { + "epoch": 1.2850782343213332, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9729759693145752, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8541951775550842, + "num_tokens": 385508780.0, + "step": 10102 + }, + { + "epoch": 1.2852054445999237, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7736058235168457, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8678163290023804, + "num_tokens": 385552646.0, + "step": 10103 + }, + { + "epoch": 1.2853326548785142, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.074779748916626, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8496737480163574, + "num_tokens": 385586676.0, + "step": 10104 + }, + { + "epoch": 1.2854598651571048, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.978158712387085, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8504025340080261, + "num_tokens": 385623201.0, + "step": 10105 + }, + { + "epoch": 1.2855870754356953, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.805822730064392, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8737209439277649, + "num_tokens": 385662716.0, + "step": 10106 + }, + { + "epoch": 1.2857142857142856, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8778958320617676, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8518073558807373, + "num_tokens": 385706986.0, + "step": 10107 + }, + { + "epoch": 1.2858414959928761, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9343934059143066, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8620575666427612, + "num_tokens": 385744240.0, + "step": 10108 + }, + { + "epoch": 1.2859687062714666, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0072200298309326, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8470351696014404, + "num_tokens": 385778583.0, + "step": 10109 + }, + { + "epoch": 1.2860959165500572, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7497406005859375, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8620721697807312, + "num_tokens": 385819910.0, + "step": 10110 + }, + { + "epoch": 1.2862231268286477, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7669202089309692, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8606144189834595, + "num_tokens": 385860068.0, + "step": 10111 + }, + { + "epoch": 1.2863503371072382, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0756258964538574, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8521789312362671, + "num_tokens": 385890460.0, + "step": 10112 + }, + { + "epoch": 1.2864775473858288, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0280919075012207, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8585302233695984, + "num_tokens": 385926174.0, + "step": 10113 + }, + { + "epoch": 1.2866047576644193, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0496997833251953, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.872553825378418, + "num_tokens": 385962928.0, + "step": 10114 + }, + { + "epoch": 1.2867319679430098, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9123926162719727, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8711625337600708, + "num_tokens": 386002509.0, + "step": 10115 + }, + { + "epoch": 1.2868591782216003, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9438014030456543, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.862414538860321, + "num_tokens": 386043304.0, + "step": 10116 + }, + { + "epoch": 1.2869863885001909, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.868277668952942, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8703403472900391, + "num_tokens": 386081607.0, + "step": 10117 + }, + { + "epoch": 1.2871135987787814, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7545517683029175, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8809075355529785, + "num_tokens": 386117762.0, + "step": 10118 + }, + { + "epoch": 1.287240809057372, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8581395149230957, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.854997456073761, + "num_tokens": 386155908.0, + "step": 10119 + }, + { + "epoch": 1.2873680193359625, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.872699499130249, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8604391813278198, + "num_tokens": 386199385.0, + "step": 10120 + }, + { + "epoch": 1.287495229614553, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.118377685546875, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8573270440101624, + "num_tokens": 386231892.0, + "step": 10121 + }, + { + "epoch": 1.2876224398931433, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9237558841705322, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8460088968276978, + "num_tokens": 386269657.0, + "step": 10122 + }, + { + "epoch": 1.2877496501717338, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8696480989456177, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8669700622558594, + "num_tokens": 386306936.0, + "step": 10123 + }, + { + "epoch": 1.2878768604503243, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9117101430892944, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8622056245803833, + "num_tokens": 386342260.0, + "step": 10124 + }, + { + "epoch": 1.2880040707289149, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.026339292526245, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8591347932815552, + "num_tokens": 386381784.0, + "step": 10125 + }, + { + "epoch": 1.2881312810075054, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8835150003433228, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8616847991943359, + "num_tokens": 386422350.0, + "step": 10126 + }, + { + "epoch": 1.288258491286096, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8217359781265259, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8577253818511963, + "num_tokens": 386467951.0, + "step": 10127 + }, + { + "epoch": 1.2883857015646865, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.749008059501648, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8707484006881714, + "num_tokens": 386511896.0, + "step": 10128 + }, + { + "epoch": 1.288512911843277, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8706555366516113, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8495306968688965, + "num_tokens": 386550704.0, + "step": 10129 + }, + { + "epoch": 1.2886401221218675, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.907770037651062, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.846809983253479, + "num_tokens": 386589295.0, + "step": 10130 + }, + { + "epoch": 1.2887673324004578, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.028352737426758, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8627270460128784, + "num_tokens": 386623462.0, + "step": 10131 + }, + { + "epoch": 1.2888945426790483, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7586932182312012, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8653527498245239, + "num_tokens": 386663036.0, + "step": 10132 + }, + { + "epoch": 1.2890217529576389, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9913537502288818, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8629364967346191, + "num_tokens": 386702530.0, + "step": 10133 + }, + { + "epoch": 1.2891489632362294, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8074123859405518, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.878049910068512, + "num_tokens": 386739758.0, + "step": 10134 + }, + { + "epoch": 1.28927617351482, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9266774654388428, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8717359304428101, + "num_tokens": 386778287.0, + "step": 10135 + }, + { + "epoch": 1.2894033837934105, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3526716232299805, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.873471736907959, + "num_tokens": 386812348.0, + "step": 10136 + }, + { + "epoch": 1.289530594072001, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7494659423828125, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8698077201843262, + "num_tokens": 386855540.0, + "step": 10137 + }, + { + "epoch": 1.2896578043505915, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8257057666778564, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8577991724014282, + "num_tokens": 386893775.0, + "step": 10138 + }, + { + "epoch": 1.289785014629182, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9661897420883179, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8629027009010315, + "num_tokens": 386929553.0, + "step": 10139 + }, + { + "epoch": 1.2899122249077726, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8769360780715942, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8566105961799622, + "num_tokens": 386968033.0, + "step": 10140 + }, + { + "epoch": 1.290039435186363, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8821501731872559, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8612911701202393, + "num_tokens": 387004967.0, + "step": 10141 + }, + { + "epoch": 1.2901666454649536, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0106449127197266, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8510864973068237, + "num_tokens": 387038086.0, + "step": 10142 + }, + { + "epoch": 1.2902938557435442, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8713420629501343, + "learning_rate": 1e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8379811644554138, + "num_tokens": 387077112.0, + "step": 10143 + }, + { + "epoch": 1.2904210660221347, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1117844581604004, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8458683490753174, + "num_tokens": 387118800.0, + "step": 10144 + }, + { + "epoch": 1.2905482763007252, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8478507995605469, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8570259213447571, + "num_tokens": 387160321.0, + "step": 10145 + }, + { + "epoch": 1.2906754865793157, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9903593063354492, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8599391579627991, + "num_tokens": 387199387.0, + "step": 10146 + }, + { + "epoch": 1.290802696857906, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.821353554725647, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8685264587402344, + "num_tokens": 387238859.0, + "step": 10147 + }, + { + "epoch": 1.2909299071364966, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.941573977470398, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8687838315963745, + "num_tokens": 387272149.0, + "step": 10148 + }, + { + "epoch": 1.291057117415087, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7166283130645752, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.860883355140686, + "num_tokens": 387311617.0, + "step": 10149 + }, + { + "epoch": 1.2911843276936776, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9636199474334717, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8481616377830505, + "num_tokens": 387344281.0, + "step": 10150 + }, + { + "epoch": 1.2913115379722682, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.828248381614685, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8514032363891602, + "num_tokens": 387385959.0, + "step": 10151 + }, + { + "epoch": 1.2914387482508587, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7994539737701416, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8575332164764404, + "num_tokens": 387426197.0, + "step": 10152 + }, + { + "epoch": 1.2915659585294492, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.899484634399414, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8678010702133179, + "num_tokens": 387462250.0, + "step": 10153 + }, + { + "epoch": 1.2916931688080397, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8178194761276245, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8620613813400269, + "num_tokens": 387499903.0, + "step": 10154 + }, + { + "epoch": 1.2918203790866303, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9525119066238403, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8710191249847412, + "num_tokens": 387535361.0, + "step": 10155 + }, + { + "epoch": 1.2919475893652206, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9663251638412476, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8598434925079346, + "num_tokens": 387574300.0, + "step": 10156 + }, + { + "epoch": 1.2920747996438111, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0168983936309814, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8627620935440063, + "num_tokens": 387606860.0, + "step": 10157 + }, + { + "epoch": 1.2922020099224016, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8976558446884155, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8515648245811462, + "num_tokens": 387646519.0, + "step": 10158 + }, + { + "epoch": 1.2923292202009922, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8610334396362305, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8746412992477417, + "num_tokens": 387681100.0, + "step": 10159 + }, + { + "epoch": 1.2924564304795827, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.909906268119812, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.883135974407196, + "num_tokens": 387715962.0, + "step": 10160 + }, + { + "epoch": 1.2925836407581732, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8754099607467651, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.859968364238739, + "num_tokens": 387755856.0, + "step": 10161 + }, + { + "epoch": 1.2927108510367638, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9220731258392334, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8697705864906311, + "num_tokens": 387793256.0, + "step": 10162 + }, + { + "epoch": 1.2928380613153543, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8364800214767456, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8563463687896729, + "num_tokens": 387834696.0, + "step": 10163 + }, + { + "epoch": 1.2929652715939448, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.6486871242523193, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8502177000045776, + "num_tokens": 387874973.0, + "step": 10164 + }, + { + "epoch": 1.2930924818725353, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2891135215759277, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8512581586837769, + "num_tokens": 387904887.0, + "step": 10165 + }, + { + "epoch": 1.2932196921511259, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.915142297744751, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8684340715408325, + "num_tokens": 387944676.0, + "step": 10166 + }, + { + "epoch": 1.2933469024297164, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.034975051879883, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8629584908485413, + "num_tokens": 387982234.0, + "step": 10167 + }, + { + "epoch": 1.293474112708307, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8361619710922241, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8628027439117432, + "num_tokens": 388018001.0, + "step": 10168 + }, + { + "epoch": 1.2936013229868975, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.845559000968933, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8572230935096741, + "num_tokens": 388057476.0, + "step": 10169 + }, + { + "epoch": 1.293728533265488, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8254035711288452, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8801617622375488, + "num_tokens": 388095351.0, + "step": 10170 + }, + { + "epoch": 1.2938557435440783, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8744471073150635, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8629244565963745, + "num_tokens": 388135366.0, + "step": 10171 + }, + { + "epoch": 1.2939829538226688, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.004836320877075, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8506985902786255, + "num_tokens": 388175003.0, + "step": 10172 + }, + { + "epoch": 1.2941101641012593, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0395681858062744, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8435922265052795, + "num_tokens": 388207083.0, + "step": 10173 + }, + { + "epoch": 1.2942373743798499, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7922875881195068, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8775858879089355, + "num_tokens": 388245038.0, + "step": 10174 + }, + { + "epoch": 1.2943645846584404, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8533841371536255, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8579729199409485, + "num_tokens": 388285035.0, + "step": 10175 + }, + { + "epoch": 1.294491794937031, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7781609296798706, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8691081404685974, + "num_tokens": 388323971.0, + "step": 10176 + }, + { + "epoch": 1.2946190052156215, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9228246212005615, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8586138486862183, + "num_tokens": 388362256.0, + "step": 10177 + }, + { + "epoch": 1.294746215494212, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7569332122802734, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.882887065410614, + "num_tokens": 388401120.0, + "step": 10178 + }, + { + "epoch": 1.2948734257728025, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7179526090621948, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8650140762329102, + "num_tokens": 388446833.0, + "step": 10179 + }, + { + "epoch": 1.2950006360513928, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8071107864379883, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8746580481529236, + "num_tokens": 388482266.0, + "step": 10180 + }, + { + "epoch": 1.2951278463299833, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9112014770507812, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.863810658454895, + "num_tokens": 388518024.0, + "step": 10181 + }, + { + "epoch": 1.2952550566085739, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8607158660888672, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8648561239242554, + "num_tokens": 388564246.0, + "step": 10182 + }, + { + "epoch": 1.2953822668871644, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3496270179748535, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.866195023059845, + "num_tokens": 388603556.0, + "step": 10183 + }, + { + "epoch": 1.295509477165755, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.951077938079834, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8714731931686401, + "num_tokens": 388643615.0, + "step": 10184 + }, + { + "epoch": 1.2956366874443455, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8769118785858154, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8660281896591187, + "num_tokens": 388682917.0, + "step": 10185 + }, + { + "epoch": 1.295763897722936, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8357205390930176, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8654003143310547, + "num_tokens": 388720269.0, + "step": 10186 + }, + { + "epoch": 1.2958911080015265, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8793467283248901, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8548101782798767, + "num_tokens": 388759245.0, + "step": 10187 + }, + { + "epoch": 1.296018318280117, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7846331596374512, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8581328988075256, + "num_tokens": 388795673.0, + "step": 10188 + }, + { + "epoch": 1.2961455285587076, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8827970027923584, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8680544495582581, + "num_tokens": 388832926.0, + "step": 10189 + }, + { + "epoch": 1.296272738837298, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.003244400024414, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8666108250617981, + "num_tokens": 388869767.0, + "step": 10190 + }, + { + "epoch": 1.2963999491158886, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2106773853302, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8653166890144348, + "num_tokens": 388913347.0, + "step": 10191 + }, + { + "epoch": 1.2965271593944792, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7654112577438354, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8739200830459595, + "num_tokens": 388948211.0, + "step": 10192 + }, + { + "epoch": 1.2966543696730697, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8189679384231567, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8682504892349243, + "num_tokens": 388989198.0, + "step": 10193 + }, + { + "epoch": 1.2967815799516602, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.866570234298706, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8550665378570557, + "num_tokens": 389029076.0, + "step": 10194 + }, + { + "epoch": 1.2969087902302507, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2790448665618896, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8635108470916748, + "num_tokens": 389066751.0, + "step": 10195 + }, + { + "epoch": 1.297036000508841, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.199042320251465, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8604733347892761, + "num_tokens": 389104207.0, + "step": 10196 + }, + { + "epoch": 1.2971632107874316, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8529072999954224, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8665199875831604, + "num_tokens": 389141941.0, + "step": 10197 + }, + { + "epoch": 1.297290421066022, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6832870244979858, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8734111785888672, + "num_tokens": 389185326.0, + "step": 10198 + }, + { + "epoch": 1.2974176313446126, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8556181192398071, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.872900128364563, + "num_tokens": 389221512.0, + "step": 10199 + }, + { + "epoch": 1.2975448416232032, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7981680631637573, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8712554574012756, + "num_tokens": 389259388.0, + "step": 10200 + }, + { + "epoch": 1.2976720519017937, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0059220790863037, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8507531881332397, + "num_tokens": 389298773.0, + "step": 10201 + }, + { + "epoch": 1.2977992621803842, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.751693844795227, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8641148209571838, + "num_tokens": 389339680.0, + "step": 10202 + }, + { + "epoch": 1.2979264724589747, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7860543727874756, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8657618165016174, + "num_tokens": 389380755.0, + "step": 10203 + }, + { + "epoch": 1.2980536827375653, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.899620771408081, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8675373792648315, + "num_tokens": 389412031.0, + "step": 10204 + }, + { + "epoch": 1.2981808930161556, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8962631225585938, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8741617202758789, + "num_tokens": 389448299.0, + "step": 10205 + }, + { + "epoch": 1.298308103294746, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9944372177124023, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8576775789260864, + "num_tokens": 389483962.0, + "step": 10206 + }, + { + "epoch": 1.2984353135733366, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0134105682373047, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.855596125125885, + "num_tokens": 389521989.0, + "step": 10207 + }, + { + "epoch": 1.2985625238519272, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.883176326751709, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.858323872089386, + "num_tokens": 389558228.0, + "step": 10208 + }, + { + "epoch": 1.2986897341305177, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.882453441619873, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.861035168170929, + "num_tokens": 389597164.0, + "step": 10209 + }, + { + "epoch": 1.2988169444091082, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8458921909332275, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8635796308517456, + "num_tokens": 389636435.0, + "step": 10210 + }, + { + "epoch": 1.2989441546876987, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.815838575363159, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8545966148376465, + "num_tokens": 389670498.0, + "step": 10211 + }, + { + "epoch": 1.2990713649662893, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8319920301437378, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8652822375297546, + "num_tokens": 389707433.0, + "step": 10212 + }, + { + "epoch": 1.2991985752448798, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9270172119140625, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8645745515823364, + "num_tokens": 389747300.0, + "step": 10213 + }, + { + "epoch": 1.2993257855234703, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7908363342285156, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8577120304107666, + "num_tokens": 389788381.0, + "step": 10214 + }, + { + "epoch": 1.2994529958020609, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.055487871170044, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8660677075386047, + "num_tokens": 389821792.0, + "step": 10215 + }, + { + "epoch": 1.2995802060806514, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7588887214660645, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8744969367980957, + "num_tokens": 389859370.0, + "step": 10216 + }, + { + "epoch": 1.299707416359242, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.921284794807434, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.854167103767395, + "num_tokens": 389897020.0, + "step": 10217 + }, + { + "epoch": 1.2998346266378324, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.13529896736145, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8370503187179565, + "num_tokens": 389927107.0, + "step": 10218 + }, + { + "epoch": 1.299961836916423, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.6429121494293213, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8603276014328003, + "num_tokens": 389966323.0, + "step": 10219 + }, + { + "epoch": 1.3000890471950133, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9431934356689453, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8615797162055969, + "num_tokens": 390001416.0, + "step": 10220 + }, + { + "epoch": 1.3002162574736038, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8047176599502563, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8651939630508423, + "num_tokens": 390042326.0, + "step": 10221 + }, + { + "epoch": 1.3003434677521943, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8485512733459473, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8537037968635559, + "num_tokens": 390084443.0, + "step": 10222 + }, + { + "epoch": 1.3004706780307849, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.762479543685913, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8561742305755615, + "num_tokens": 390126309.0, + "step": 10223 + }, + { + "epoch": 1.3005978883093754, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.015575647354126, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8669863343238831, + "num_tokens": 390161463.0, + "step": 10224 + }, + { + "epoch": 1.300725098587966, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8674402236938477, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8727871179580688, + "num_tokens": 390203221.0, + "step": 10225 + }, + { + "epoch": 1.3008523088665565, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.021233081817627, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8795533776283264, + "num_tokens": 390243269.0, + "step": 10226 + }, + { + "epoch": 1.300979519145147, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8376901149749756, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8641517162322998, + "num_tokens": 390282488.0, + "step": 10227 + }, + { + "epoch": 1.3011067294237375, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8531817197799683, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8586744070053101, + "num_tokens": 390320268.0, + "step": 10228 + }, + { + "epoch": 1.3012339397023278, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0950331687927246, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8517338037490845, + "num_tokens": 390354733.0, + "step": 10229 + }, + { + "epoch": 1.3013611499809183, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3160793781280518, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8618652820587158, + "num_tokens": 390388161.0, + "step": 10230 + }, + { + "epoch": 1.3014883602595089, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8158296346664429, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8732292652130127, + "num_tokens": 390424609.0, + "step": 10231 + }, + { + "epoch": 1.3016155705380994, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.358440399169922, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8805791139602661, + "num_tokens": 390460809.0, + "step": 10232 + }, + { + "epoch": 1.30174278081669, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8328239917755127, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8539302349090576, + "num_tokens": 390500198.0, + "step": 10233 + }, + { + "epoch": 1.3018699910952805, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6499950885772705, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8577135801315308, + "num_tokens": 390545588.0, + "step": 10234 + }, + { + "epoch": 1.301997201373871, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8757350444793701, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8535770773887634, + "num_tokens": 390581681.0, + "step": 10235 + }, + { + "epoch": 1.3021244116524615, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6364624500274658, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.880746603012085, + "num_tokens": 390619122.0, + "step": 10236 + }, + { + "epoch": 1.302251621931052, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6909390687942505, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8505075573921204, + "num_tokens": 390667974.0, + "step": 10237 + }, + { + "epoch": 1.3023788322096426, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.132504463195801, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8659595251083374, + "num_tokens": 390705188.0, + "step": 10238 + }, + { + "epoch": 1.302506042488233, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7565438747406006, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8723880052566528, + "num_tokens": 390746892.0, + "step": 10239 + }, + { + "epoch": 1.3026332527668236, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8012573719024658, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8781375885009766, + "num_tokens": 390787727.0, + "step": 10240 + }, + { + "epoch": 1.3027604630454142, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9553775787353516, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8556445837020874, + "num_tokens": 390823702.0, + "step": 10241 + }, + { + "epoch": 1.3028876733240047, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9340006113052368, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8644507527351379, + "num_tokens": 390860256.0, + "step": 10242 + }, + { + "epoch": 1.3030148836025952, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1934728622436523, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8764314651489258, + "num_tokens": 390901676.0, + "step": 10243 + }, + { + "epoch": 1.3031420938811857, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.219484567642212, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8451570868492126, + "num_tokens": 390936911.0, + "step": 10244 + }, + { + "epoch": 1.303269304159776, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.988931655883789, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8509680032730103, + "num_tokens": 390976185.0, + "step": 10245 + }, + { + "epoch": 1.3033965144383666, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9373881816864014, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8628455996513367, + "num_tokens": 391014661.0, + "step": 10246 + }, + { + "epoch": 1.303523724716957, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.088300943374634, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8550059795379639, + "num_tokens": 391056205.0, + "step": 10247 + }, + { + "epoch": 1.3036509349955476, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8431535959243774, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8646348714828491, + "num_tokens": 391092187.0, + "step": 10248 + }, + { + "epoch": 1.3037781452741382, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8998610973358154, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8777095079421997, + "num_tokens": 391130890.0, + "step": 10249 + }, + { + "epoch": 1.3039053555527287, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8056252002716064, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8718154430389404, + "num_tokens": 391172783.0, + "step": 10250 + }, + { + "epoch": 1.3040325658313192, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8378785848617554, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8460375070571899, + "num_tokens": 391215031.0, + "step": 10251 + }, + { + "epoch": 1.3041597761099097, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7811061143875122, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.86667799949646, + "num_tokens": 391251400.0, + "step": 10252 + }, + { + "epoch": 1.3042869863885003, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0427513122558594, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8546562790870667, + "num_tokens": 391285895.0, + "step": 10253 + }, + { + "epoch": 1.3044141966670906, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 3.0568673610687256, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8647267818450928, + "num_tokens": 391328361.0, + "step": 10254 + }, + { + "epoch": 1.304541406945681, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0912766456604004, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8614727258682251, + "num_tokens": 391365019.0, + "step": 10255 + }, + { + "epoch": 1.3046686172242716, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.920876145362854, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8479694128036499, + "num_tokens": 391400407.0, + "step": 10256 + }, + { + "epoch": 1.3047958275028622, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8636337518692017, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.845436692237854, + "num_tokens": 391440660.0, + "step": 10257 + }, + { + "epoch": 1.3049230377814527, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9498554468154907, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8648960590362549, + "num_tokens": 391481096.0, + "step": 10258 + }, + { + "epoch": 1.3050502480600432, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0066211223602295, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8627952337265015, + "num_tokens": 391516356.0, + "step": 10259 + }, + { + "epoch": 1.3051774583386337, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8960314989089966, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8522786498069763, + "num_tokens": 391554691.0, + "step": 10260 + }, + { + "epoch": 1.3053046686172243, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.911952257156372, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8498314619064331, + "num_tokens": 391592473.0, + "step": 10261 + }, + { + "epoch": 1.3054318788958148, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.872052788734436, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.86933833360672, + "num_tokens": 391632091.0, + "step": 10262 + }, + { + "epoch": 1.3055590891744053, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8547393083572388, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8571714162826538, + "num_tokens": 391676121.0, + "step": 10263 + }, + { + "epoch": 1.3056862994529959, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9459805488586426, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8595727682113647, + "num_tokens": 391709664.0, + "step": 10264 + }, + { + "epoch": 1.3058135097315864, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.794135332107544, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8512561917304993, + "num_tokens": 391755828.0, + "step": 10265 + }, + { + "epoch": 1.305940720010177, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7850152254104614, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8791805505752563, + "num_tokens": 391800232.0, + "step": 10266 + }, + { + "epoch": 1.3060679302887674, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9128168821334839, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8656127452850342, + "num_tokens": 391837612.0, + "step": 10267 + }, + { + "epoch": 1.306195140567358, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9308013916015625, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8744043111801147, + "num_tokens": 391871193.0, + "step": 10268 + }, + { + "epoch": 1.3063223508459483, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.066758632659912, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8543741106987, + "num_tokens": 391908203.0, + "step": 10269 + }, + { + "epoch": 1.3064495611245388, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.050328493118286, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8677190542221069, + "num_tokens": 391938744.0, + "step": 10270 + }, + { + "epoch": 1.3065767714031293, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9868794679641724, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8659529685974121, + "num_tokens": 391976020.0, + "step": 10271 + }, + { + "epoch": 1.3067039816817199, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0903398990631104, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8539718985557556, + "num_tokens": 392018006.0, + "step": 10272 + }, + { + "epoch": 1.3068311919603104, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0177462100982666, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8605626225471497, + "num_tokens": 392055358.0, + "step": 10273 + }, + { + "epoch": 1.306958402238901, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2813682556152344, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8601771593093872, + "num_tokens": 392097727.0, + "step": 10274 + }, + { + "epoch": 1.3070856125174914, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8340206146240234, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8684362769126892, + "num_tokens": 392144641.0, + "step": 10275 + }, + { + "epoch": 1.307212822796082, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0175743103027344, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.869922399520874, + "num_tokens": 392178810.0, + "step": 10276 + }, + { + "epoch": 1.3073400330746725, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9607341289520264, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.858154296875, + "num_tokens": 392213571.0, + "step": 10277 + }, + { + "epoch": 1.3074672433532628, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8899223804473877, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8596447706222534, + "num_tokens": 392254628.0, + "step": 10278 + }, + { + "epoch": 1.3075944536318533, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9116857051849365, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8623461723327637, + "num_tokens": 392297243.0, + "step": 10279 + }, + { + "epoch": 1.3077216639104439, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2134246826171875, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.864000678062439, + "num_tokens": 392329116.0, + "step": 10280 + }, + { + "epoch": 1.3078488741890344, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3150174617767334, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8593144416809082, + "num_tokens": 392370542.0, + "step": 10281 + }, + { + "epoch": 1.307976084467625, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9030665159225464, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8679194450378418, + "num_tokens": 392411155.0, + "step": 10282 + }, + { + "epoch": 1.3081032947462155, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9854658842086792, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8642363548278809, + "num_tokens": 392450440.0, + "step": 10283 + }, + { + "epoch": 1.308230505024806, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.160006046295166, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8738236427307129, + "num_tokens": 392488545.0, + "step": 10284 + }, + { + "epoch": 1.3083577153033965, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0682880878448486, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8611093759536743, + "num_tokens": 392527138.0, + "step": 10285 + }, + { + "epoch": 1.308484925581987, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8332918882369995, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8625748157501221, + "num_tokens": 392568181.0, + "step": 10286 + }, + { + "epoch": 1.3086121358605776, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8254506587982178, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8768001794815063, + "num_tokens": 392610900.0, + "step": 10287 + }, + { + "epoch": 1.308739346139168, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.5062613487243652, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8546571135520935, + "num_tokens": 392646624.0, + "step": 10288 + }, + { + "epoch": 1.3088665564177586, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9022016525268555, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8626078367233276, + "num_tokens": 392683103.0, + "step": 10289 + }, + { + "epoch": 1.3089937666963491, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.805987000465393, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8640685081481934, + "num_tokens": 392723169.0, + "step": 10290 + }, + { + "epoch": 1.3091209769749397, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7078332901000977, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8787286877632141, + "num_tokens": 392762323.0, + "step": 10291 + }, + { + "epoch": 1.3092481872535302, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8359105587005615, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8803545236587524, + "num_tokens": 392797555.0, + "step": 10292 + }, + { + "epoch": 1.3093753975321207, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9351788759231567, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8735932111740112, + "num_tokens": 392840001.0, + "step": 10293 + }, + { + "epoch": 1.309502607810711, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1700289249420166, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8624885082244873, + "num_tokens": 392880871.0, + "step": 10294 + }, + { + "epoch": 1.3096298180893016, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9229919910430908, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8742849826812744, + "num_tokens": 392917554.0, + "step": 10295 + }, + { + "epoch": 1.309757028367892, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1401584148406982, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8819959163665771, + "num_tokens": 392958601.0, + "step": 10296 + }, + { + "epoch": 1.3098842386464826, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8326416015625, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8643816709518433, + "num_tokens": 393000108.0, + "step": 10297 + }, + { + "epoch": 1.3100114489250732, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0517616271972656, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8838546276092529, + "num_tokens": 393032321.0, + "step": 10298 + }, + { + "epoch": 1.3101386592036637, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9137905836105347, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8621565699577332, + "num_tokens": 393068347.0, + "step": 10299 + }, + { + "epoch": 1.3102658694822542, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.04807710647583, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8801039457321167, + "num_tokens": 393102509.0, + "step": 10300 + }, + { + "epoch": 1.3103930797608447, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9329997301101685, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8721764087677002, + "num_tokens": 393140974.0, + "step": 10301 + }, + { + "epoch": 1.3105202900394353, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0946178436279297, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8732346892356873, + "num_tokens": 393177214.0, + "step": 10302 + }, + { + "epoch": 1.3106475003180256, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.831953763961792, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8668309450149536, + "num_tokens": 393215801.0, + "step": 10303 + }, + { + "epoch": 1.310774710596616, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8880342245101929, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.873836100101471, + "num_tokens": 393256444.0, + "step": 10304 + }, + { + "epoch": 1.3109019208752066, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7681479454040527, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8605371713638306, + "num_tokens": 393300238.0, + "step": 10305 + }, + { + "epoch": 1.3110291311537972, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9036332368850708, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8492603898048401, + "num_tokens": 393344087.0, + "step": 10306 + }, + { + "epoch": 1.3111563414323877, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7623562812805176, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.868707537651062, + "num_tokens": 393381678.0, + "step": 10307 + }, + { + "epoch": 1.3112835517109782, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.054110527038574, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8627429008483887, + "num_tokens": 393415740.0, + "step": 10308 + }, + { + "epoch": 1.3114107619895687, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9715867042541504, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8388959765434265, + "num_tokens": 393457692.0, + "step": 10309 + }, + { + "epoch": 1.3115379722681593, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.25278377532959, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8605990409851074, + "num_tokens": 393499003.0, + "step": 10310 + }, + { + "epoch": 1.3116651825467498, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.1381583213806152, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8523867726325989, + "num_tokens": 393533842.0, + "step": 10311 + }, + { + "epoch": 1.3117923928253403, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.038510799407959, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8712760210037231, + "num_tokens": 393573906.0, + "step": 10312 + }, + { + "epoch": 1.3119196031039309, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.484342575073242, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8616310358047485, + "num_tokens": 393615298.0, + "step": 10313 + }, + { + "epoch": 1.3120468133825214, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.926672101020813, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.85364830493927, + "num_tokens": 393654690.0, + "step": 10314 + }, + { + "epoch": 1.312174023661112, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8557212352752686, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.864875316619873, + "num_tokens": 393696222.0, + "step": 10315 + }, + { + "epoch": 1.3123012339397024, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8933360576629639, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8606204986572266, + "num_tokens": 393736748.0, + "step": 10316 + }, + { + "epoch": 1.312428444218293, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9498368501663208, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8547336459159851, + "num_tokens": 393773077.0, + "step": 10317 + }, + { + "epoch": 1.3125556544968833, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.353358745574951, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8743636012077332, + "num_tokens": 393806466.0, + "step": 10318 + }, + { + "epoch": 1.3126828647754738, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.1606531143188477, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8601594567298889, + "num_tokens": 393840922.0, + "step": 10319 + }, + { + "epoch": 1.3128100750540643, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.301600456237793, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.86871337890625, + "num_tokens": 393874544.0, + "step": 10320 + }, + { + "epoch": 1.3129372853326549, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.89056396484375, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.859763503074646, + "num_tokens": 393917773.0, + "step": 10321 + }, + { + "epoch": 1.3130644956112454, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0221312046051025, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8578565120697021, + "num_tokens": 393951525.0, + "step": 10322 + }, + { + "epoch": 1.313191705889836, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8031470775604248, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8671405911445618, + "num_tokens": 393989796.0, + "step": 10323 + }, + { + "epoch": 1.3133189161684264, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9864699840545654, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8606904745101929, + "num_tokens": 394028746.0, + "step": 10324 + }, + { + "epoch": 1.313446126447017, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.880750060081482, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8717087507247925, + "num_tokens": 394065241.0, + "step": 10325 + }, + { + "epoch": 1.3135733367256075, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.022505521774292, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8575153350830078, + "num_tokens": 394110113.0, + "step": 10326 + }, + { + "epoch": 1.3137005470041978, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9674543142318726, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8459494113922119, + "num_tokens": 394146577.0, + "step": 10327 + }, + { + "epoch": 1.3138277572827883, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0131092071533203, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8640995621681213, + "num_tokens": 394185392.0, + "step": 10328 + }, + { + "epoch": 1.3139549675613789, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9019957780838013, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.853888750076294, + "num_tokens": 394226301.0, + "step": 10329 + }, + { + "epoch": 1.3140821778399694, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9439054727554321, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8545471429824829, + "num_tokens": 394262654.0, + "step": 10330 + }, + { + "epoch": 1.31420938811856, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0737173557281494, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8749575614929199, + "num_tokens": 394299185.0, + "step": 10331 + }, + { + "epoch": 1.3143365983971504, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9638409614562988, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8646843433380127, + "num_tokens": 394331364.0, + "step": 10332 + }, + { + "epoch": 1.314463808675741, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8982752561569214, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8508129119873047, + "num_tokens": 394375479.0, + "step": 10333 + }, + { + "epoch": 1.3145910189543315, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 16.980873107910156, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8692030310630798, + "num_tokens": 394414913.0, + "step": 10334 + }, + { + "epoch": 1.314718229232922, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.049811363220215, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8635995388031006, + "num_tokens": 394450889.0, + "step": 10335 + }, + { + "epoch": 1.3148454395115126, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9078093767166138, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8472108840942383, + "num_tokens": 394495280.0, + "step": 10336 + }, + { + "epoch": 1.314972649790103, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7254737615585327, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8721437454223633, + "num_tokens": 394536811.0, + "step": 10337 + }, + { + "epoch": 1.3150998600686936, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7993746995925903, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8597933053970337, + "num_tokens": 394573353.0, + "step": 10338 + }, + { + "epoch": 1.3152270703472841, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2786881923675537, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8551689386367798, + "num_tokens": 394603517.0, + "step": 10339 + }, + { + "epoch": 1.3153542806258747, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7432929277420044, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8697102069854736, + "num_tokens": 394650547.0, + "step": 10340 + }, + { + "epoch": 1.3154814909044652, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9176791906356812, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8682816028594971, + "num_tokens": 394684460.0, + "step": 10341 + }, + { + "epoch": 1.3156087011830557, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7386671304702759, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.879228413105011, + "num_tokens": 394733520.0, + "step": 10342 + }, + { + "epoch": 1.315735911461646, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9241920709609985, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8574020266532898, + "num_tokens": 394775498.0, + "step": 10343 + }, + { + "epoch": 1.3158631217402366, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.853100061416626, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.850770115852356, + "num_tokens": 394815138.0, + "step": 10344 + }, + { + "epoch": 1.315990332018827, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1396801471710205, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.856106162071228, + "num_tokens": 394850447.0, + "step": 10345 + }, + { + "epoch": 1.3161175422974176, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3754923343658447, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8567789196968079, + "num_tokens": 394891800.0, + "step": 10346 + }, + { + "epoch": 1.3162447525760081, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9059979915618896, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8689489364624023, + "num_tokens": 394930087.0, + "step": 10347 + }, + { + "epoch": 1.3163719628545987, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.865508794784546, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8719695806503296, + "num_tokens": 394964497.0, + "step": 10348 + }, + { + "epoch": 1.3164991731331892, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1046345233917236, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8447418212890625, + "num_tokens": 395003127.0, + "step": 10349 + }, + { + "epoch": 1.3166263834117797, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2018778324127197, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8538969159126282, + "num_tokens": 395040570.0, + "step": 10350 + }, + { + "epoch": 1.3167535936903703, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.050947427749634, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8648638129234314, + "num_tokens": 395075205.0, + "step": 10351 + }, + { + "epoch": 1.3168808039689606, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8927788734436035, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8618860840797424, + "num_tokens": 395117492.0, + "step": 10352 + }, + { + "epoch": 1.317008014247551, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.831149935722351, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8606841564178467, + "num_tokens": 395162449.0, + "step": 10353 + }, + { + "epoch": 1.3171352245261416, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.147925853729248, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8717478513717651, + "num_tokens": 395197310.0, + "step": 10354 + }, + { + "epoch": 1.3172624348047322, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0670650005340576, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8742021918296814, + "num_tokens": 395232420.0, + "step": 10355 + }, + { + "epoch": 1.3173896450833227, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9726613759994507, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8413358926773071, + "num_tokens": 395269392.0, + "step": 10356 + }, + { + "epoch": 1.3175168553619132, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9934120178222656, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8517424464225769, + "num_tokens": 395305496.0, + "step": 10357 + }, + { + "epoch": 1.3176440656405037, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.785757064819336, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8737149834632874, + "num_tokens": 395347089.0, + "step": 10358 + }, + { + "epoch": 1.3177712759190943, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8515230417251587, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8742923140525818, + "num_tokens": 395382885.0, + "step": 10359 + }, + { + "epoch": 1.3178984861976848, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.108103036880493, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8773860335350037, + "num_tokens": 395416260.0, + "step": 10360 + }, + { + "epoch": 1.3180256964762753, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.5503480434417725, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8556689023971558, + "num_tokens": 395454179.0, + "step": 10361 + }, + { + "epoch": 1.3181529067548658, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.191122055053711, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8531777262687683, + "num_tokens": 395489173.0, + "step": 10362 + }, + { + "epoch": 1.3182801170334564, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.037827253341675, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8571200370788574, + "num_tokens": 395532057.0, + "step": 10363 + }, + { + "epoch": 1.318407327312047, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8762280941009521, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8695381283760071, + "num_tokens": 395570321.0, + "step": 10364 + }, + { + "epoch": 1.3185345375906374, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8115414381027222, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8834604024887085, + "num_tokens": 395609041.0, + "step": 10365 + }, + { + "epoch": 1.318661747869228, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8409866094589233, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8404657244682312, + "num_tokens": 395649961.0, + "step": 10366 + }, + { + "epoch": 1.3187889581478183, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.6344592571258545, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8642663955688477, + "num_tokens": 395691882.0, + "step": 10367 + }, + { + "epoch": 1.3189161684264088, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8258302211761475, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8500351905822754, + "num_tokens": 395731496.0, + "step": 10368 + }, + { + "epoch": 1.3190433787049993, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6990691423416138, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8673263192176819, + "num_tokens": 395772618.0, + "step": 10369 + }, + { + "epoch": 1.3191705889835899, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8121775388717651, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8461125493049622, + "num_tokens": 395811907.0, + "step": 10370 + }, + { + "epoch": 1.3192977992621804, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8184059858322144, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.857073962688446, + "num_tokens": 395851186.0, + "step": 10371 + }, + { + "epoch": 1.319425009540771, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7914869785308838, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8634020090103149, + "num_tokens": 395893868.0, + "step": 10372 + }, + { + "epoch": 1.3195522198193614, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0244202613830566, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8732115626335144, + "num_tokens": 395930519.0, + "step": 10373 + }, + { + "epoch": 1.319679430097952, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.810543179512024, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8782517910003662, + "num_tokens": 395965692.0, + "step": 10374 + }, + { + "epoch": 1.3198066403765425, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9163678884506226, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8630096912384033, + "num_tokens": 396002316.0, + "step": 10375 + }, + { + "epoch": 1.3199338506551328, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0661280155181885, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8692165613174438, + "num_tokens": 396043392.0, + "step": 10376 + }, + { + "epoch": 1.3200610609337233, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0069468021392822, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8599730730056763, + "num_tokens": 396078603.0, + "step": 10377 + }, + { + "epoch": 1.3201882712123139, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.823813557624817, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8554927110671997, + "num_tokens": 396121132.0, + "step": 10378 + }, + { + "epoch": 1.3203154814909044, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.123453378677368, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8720683455467224, + "num_tokens": 396155730.0, + "step": 10379 + }, + { + "epoch": 1.320442691769495, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0153980255126953, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8492136597633362, + "num_tokens": 396190976.0, + "step": 10380 + }, + { + "epoch": 1.3205699020480854, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8622618913650513, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8632753491401672, + "num_tokens": 396229571.0, + "step": 10381 + }, + { + "epoch": 1.320697112326676, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9300897121429443, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.86232990026474, + "num_tokens": 396265586.0, + "step": 10382 + }, + { + "epoch": 1.3208243226052665, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1184921264648438, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.868636965751648, + "num_tokens": 396303810.0, + "step": 10383 + }, + { + "epoch": 1.320951532883857, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9750516414642334, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8820459842681885, + "num_tokens": 396345671.0, + "step": 10384 + }, + { + "epoch": 1.3210787431624476, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7458280324935913, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8786910772323608, + "num_tokens": 396388062.0, + "step": 10385 + }, + { + "epoch": 1.321205953441038, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7714792490005493, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.876526951789856, + "num_tokens": 396425189.0, + "step": 10386 + }, + { + "epoch": 1.3213331637196286, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.850955605506897, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8529460430145264, + "num_tokens": 396461861.0, + "step": 10387 + }, + { + "epoch": 1.3214603739982191, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.859621524810791, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8742603063583374, + "num_tokens": 396506190.0, + "step": 10388 + }, + { + "epoch": 1.3215875842768097, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0596885681152344, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8650104999542236, + "num_tokens": 396538900.0, + "step": 10389 + }, + { + "epoch": 1.3217147945554002, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.783388614654541, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8725390434265137, + "num_tokens": 396577029.0, + "step": 10390 + }, + { + "epoch": 1.3218420048339907, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8094208240509033, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8487223982810974, + "num_tokens": 396616917.0, + "step": 10391 + }, + { + "epoch": 1.321969215112581, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8336951732635498, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8717458844184875, + "num_tokens": 396651726.0, + "step": 10392 + }, + { + "epoch": 1.3220964253911716, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8468457460403442, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8520811796188354, + "num_tokens": 396693796.0, + "step": 10393 + }, + { + "epoch": 1.322223635669762, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9553989171981812, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8525742888450623, + "num_tokens": 396731947.0, + "step": 10394 + }, + { + "epoch": 1.3223508459483526, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0445778369903564, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8674749135971069, + "num_tokens": 396767062.0, + "step": 10395 + }, + { + "epoch": 1.3224780562269431, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8889024257659912, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8666380643844604, + "num_tokens": 396803918.0, + "step": 10396 + }, + { + "epoch": 1.3226052665055337, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7648311853408813, + "learning_rate": 1e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8376154899597168, + "num_tokens": 396850314.0, + "step": 10397 + }, + { + "epoch": 1.3227324767841242, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.870985746383667, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.855715274810791, + "num_tokens": 396887920.0, + "step": 10398 + }, + { + "epoch": 1.3228596870627147, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0101664066314697, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8527500033378601, + "num_tokens": 396923369.0, + "step": 10399 + }, + { + "epoch": 1.3229868973413053, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.7756009101867676, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8607521057128906, + "num_tokens": 396962338.0, + "step": 10400 + }, + { + "epoch": 1.3231141076198956, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0431180000305176, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8742804527282715, + "num_tokens": 396998275.0, + "step": 10401 + }, + { + "epoch": 1.323241317898486, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9003334045410156, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8605107069015503, + "num_tokens": 397040034.0, + "step": 10402 + }, + { + "epoch": 1.3233685281770766, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.749141812324524, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8829964995384216, + "num_tokens": 397080549.0, + "step": 10403 + }, + { + "epoch": 1.3234957384556671, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8435252904891968, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8760724067687988, + "num_tokens": 397120803.0, + "step": 10404 + }, + { + "epoch": 1.3236229487342577, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8280421495437622, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8672611117362976, + "num_tokens": 397160463.0, + "step": 10405 + }, + { + "epoch": 1.3237501590128482, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7705074548721313, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8570173382759094, + "num_tokens": 397205425.0, + "step": 10406 + }, + { + "epoch": 1.3238773692914387, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0835635662078857, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8579699397087097, + "num_tokens": 397238907.0, + "step": 10407 + }, + { + "epoch": 1.3240045795700293, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9654948711395264, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.864139199256897, + "num_tokens": 397281639.0, + "step": 10408 + }, + { + "epoch": 1.3241317898486198, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7680420875549316, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8518739938735962, + "num_tokens": 397323350.0, + "step": 10409 + }, + { + "epoch": 1.3242590001272103, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7724310159683228, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8565768003463745, + "num_tokens": 397364438.0, + "step": 10410 + }, + { + "epoch": 1.3243862104058008, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7638609409332275, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.851158857345581, + "num_tokens": 397406881.0, + "step": 10411 + }, + { + "epoch": 1.3245134206843914, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.061434268951416, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.865222692489624, + "num_tokens": 397451190.0, + "step": 10412 + }, + { + "epoch": 1.324640630962982, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9017159938812256, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8633296489715576, + "num_tokens": 397491063.0, + "step": 10413 + }, + { + "epoch": 1.3247678412415724, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.83909273147583, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8639978766441345, + "num_tokens": 397529961.0, + "step": 10414 + }, + { + "epoch": 1.324895051520163, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8423335552215576, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8738013505935669, + "num_tokens": 397565616.0, + "step": 10415 + }, + { + "epoch": 1.3250222617987533, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9814993143081665, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8694773316383362, + "num_tokens": 397602223.0, + "step": 10416 + }, + { + "epoch": 1.3251494720773438, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.076805353164673, + "learning_rate": 1e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8420071601867676, + "num_tokens": 397638910.0, + "step": 10417 + }, + { + "epoch": 1.3252766823559343, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7771683931350708, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.856103777885437, + "num_tokens": 397681200.0, + "step": 10418 + }, + { + "epoch": 1.3254038926345248, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9752812385559082, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8648651838302612, + "num_tokens": 397724832.0, + "step": 10419 + }, + { + "epoch": 1.3255311029131154, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.917141318321228, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8562310934066772, + "num_tokens": 397765537.0, + "step": 10420 + }, + { + "epoch": 1.325658313191706, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8727755546569824, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8553926944732666, + "num_tokens": 397800945.0, + "step": 10421 + }, + { + "epoch": 1.3257855234702964, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8298420906066895, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8682892322540283, + "num_tokens": 397840786.0, + "step": 10422 + }, + { + "epoch": 1.325912733748887, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8711696863174438, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8679096698760986, + "num_tokens": 397875607.0, + "step": 10423 + }, + { + "epoch": 1.3260399440274775, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.003075361251831, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8787912726402283, + "num_tokens": 397909524.0, + "step": 10424 + }, + { + "epoch": 1.3261671543060678, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9764502048492432, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8469345569610596, + "num_tokens": 397947862.0, + "step": 10425 + }, + { + "epoch": 1.3262943645846583, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1902711391448975, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8764142990112305, + "num_tokens": 397980958.0, + "step": 10426 + }, + { + "epoch": 1.3264215748632489, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.891144037246704, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8598899841308594, + "num_tokens": 398021231.0, + "step": 10427 + }, + { + "epoch": 1.3265487851418394, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7588993310928345, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8614157438278198, + "num_tokens": 398059202.0, + "step": 10428 + }, + { + "epoch": 1.32667599542043, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.690355658531189, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8847044706344604, + "num_tokens": 398097121.0, + "step": 10429 + }, + { + "epoch": 1.3268032056990204, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9073933362960815, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8628535270690918, + "num_tokens": 398130413.0, + "step": 10430 + }, + { + "epoch": 1.326930415977611, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.788023829460144, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8614431619644165, + "num_tokens": 398172535.0, + "step": 10431 + }, + { + "epoch": 1.3270576262562015, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0716660022735596, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8644254207611084, + "num_tokens": 398206681.0, + "step": 10432 + }, + { + "epoch": 1.327184836534792, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.005829095840454, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8667067885398865, + "num_tokens": 398245206.0, + "step": 10433 + }, + { + "epoch": 1.3273120468133826, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1279516220092773, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8591845631599426, + "num_tokens": 398278902.0, + "step": 10434 + }, + { + "epoch": 1.327439257091973, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9216536283493042, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8644528388977051, + "num_tokens": 398315833.0, + "step": 10435 + }, + { + "epoch": 1.3275664673705636, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.521416187286377, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8469678163528442, + "num_tokens": 398356496.0, + "step": 10436 + }, + { + "epoch": 1.3276936776491541, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8913838863372803, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8696394562721252, + "num_tokens": 398393661.0, + "step": 10437 + }, + { + "epoch": 1.3278208879277447, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7940914630889893, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8803871870040894, + "num_tokens": 398430463.0, + "step": 10438 + }, + { + "epoch": 1.3279480982063352, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8944334983825684, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8628215193748474, + "num_tokens": 398470382.0, + "step": 10439 + }, + { + "epoch": 1.3280753084849257, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.124446153640747, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8461370468139648, + "num_tokens": 398504793.0, + "step": 10440 + }, + { + "epoch": 1.328202518763516, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9233993291854858, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8553133010864258, + "num_tokens": 398541038.0, + "step": 10441 + }, + { + "epoch": 1.3283297290421066, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.896180272102356, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8466251492500305, + "num_tokens": 398578047.0, + "step": 10442 + }, + { + "epoch": 1.328456939320697, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.867364525794983, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8509177565574646, + "num_tokens": 398618946.0, + "step": 10443 + }, + { + "epoch": 1.3285841495992876, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.016152858734131, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8673071265220642, + "num_tokens": 398656895.0, + "step": 10444 + }, + { + "epoch": 1.3287113598778781, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8905956745147705, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.864124059677124, + "num_tokens": 398696521.0, + "step": 10445 + }, + { + "epoch": 1.3288385701564687, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8983142375946045, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8646411895751953, + "num_tokens": 398735275.0, + "step": 10446 + }, + { + "epoch": 1.3289657804350592, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9076114892959595, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.848163902759552, + "num_tokens": 398774753.0, + "step": 10447 + }, + { + "epoch": 1.3290929907136497, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8347094058990479, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8570353388786316, + "num_tokens": 398810893.0, + "step": 10448 + }, + { + "epoch": 1.3292202009922403, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8723121881484985, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8581070899963379, + "num_tokens": 398849741.0, + "step": 10449 + }, + { + "epoch": 1.3293474112708306, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.119737386703491, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8570876717567444, + "num_tokens": 398881186.0, + "step": 10450 + }, + { + "epoch": 1.329474621549421, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.853147029876709, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8801649212837219, + "num_tokens": 398918677.0, + "step": 10451 + }, + { + "epoch": 1.3296018318280116, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.546865701675415, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8610879182815552, + "num_tokens": 398953435.0, + "step": 10452 + }, + { + "epoch": 1.3297290421066021, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.89829683303833, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8698563575744629, + "num_tokens": 398991799.0, + "step": 10453 + }, + { + "epoch": 1.3298562523851927, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9378825426101685, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8664729595184326, + "num_tokens": 399023211.0, + "step": 10454 + }, + { + "epoch": 1.3299834626637832, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9687002897262573, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8586571216583252, + "num_tokens": 399061431.0, + "step": 10455 + }, + { + "epoch": 1.3301106729423737, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8901797533035278, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8642826080322266, + "num_tokens": 399097384.0, + "step": 10456 + }, + { + "epoch": 1.3302378832209643, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9587472677230835, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8655014634132385, + "num_tokens": 399133935.0, + "step": 10457 + }, + { + "epoch": 1.3303650934995548, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0826053619384766, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8683618307113647, + "num_tokens": 399165709.0, + "step": 10458 + }, + { + "epoch": 1.3304923037781453, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8824394941329956, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8604097366333008, + "num_tokens": 399203548.0, + "step": 10459 + }, + { + "epoch": 1.3306195140567358, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8479163646697998, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8616254329681396, + "num_tokens": 399244341.0, + "step": 10460 + }, + { + "epoch": 1.3307467243353264, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8540544509887695, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.868547797203064, + "num_tokens": 399282284.0, + "step": 10461 + }, + { + "epoch": 1.330873934613917, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7633891105651855, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8767930269241333, + "num_tokens": 399320523.0, + "step": 10462 + }, + { + "epoch": 1.3310011448925074, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8225281238555908, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8646054267883301, + "num_tokens": 399357489.0, + "step": 10463 + }, + { + "epoch": 1.331128355171098, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.141892671585083, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8514392375946045, + "num_tokens": 399393561.0, + "step": 10464 + }, + { + "epoch": 1.3312555654496883, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0354437828063965, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8677037954330444, + "num_tokens": 399424514.0, + "step": 10465 + }, + { + "epoch": 1.3313827757282788, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9132168292999268, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8683167695999146, + "num_tokens": 399461372.0, + "step": 10466 + }, + { + "epoch": 1.3315099860068693, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7963659763336182, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8774910569190979, + "num_tokens": 399502503.0, + "step": 10467 + }, + { + "epoch": 1.3316371962854598, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.210446357727051, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8572899699211121, + "num_tokens": 399537836.0, + "step": 10468 + }, + { + "epoch": 1.3317644065640504, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1835832595825195, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.858444094657898, + "num_tokens": 399575189.0, + "step": 10469 + }, + { + "epoch": 1.331891616842641, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0388894081115723, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.844292163848877, + "num_tokens": 399614406.0, + "step": 10470 + }, + { + "epoch": 1.3320188271212314, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8744213581085205, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8688614368438721, + "num_tokens": 399652847.0, + "step": 10471 + }, + { + "epoch": 1.332146037399822, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9839433431625366, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8528693914413452, + "num_tokens": 399691628.0, + "step": 10472 + }, + { + "epoch": 1.3322732476784125, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.779610276222229, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8745640516281128, + "num_tokens": 399732875.0, + "step": 10473 + }, + { + "epoch": 1.3324004579570028, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0437607765197754, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8736259937286377, + "num_tokens": 399769537.0, + "step": 10474 + }, + { + "epoch": 1.3325276682355933, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0485618114471436, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8474843502044678, + "num_tokens": 399807757.0, + "step": 10475 + }, + { + "epoch": 1.3326548785141838, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.9247686862945557, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8702985048294067, + "num_tokens": 399837351.0, + "step": 10476 + }, + { + "epoch": 1.3327820887927744, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0408480167388916, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8823578357696533, + "num_tokens": 399873678.0, + "step": 10477 + }, + { + "epoch": 1.332909299071365, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.84309720993042, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8649728298187256, + "num_tokens": 399916206.0, + "step": 10478 + }, + { + "epoch": 1.3330365093499554, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9683204889297485, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8751412034034729, + "num_tokens": 399954144.0, + "step": 10479 + }, + { + "epoch": 1.333163719628546, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7327011823654175, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8715953230857849, + "num_tokens": 399995865.0, + "step": 10480 + }, + { + "epoch": 1.3332909299071365, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8932453393936157, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8646638989448547, + "num_tokens": 400033234.0, + "step": 10481 + }, + { + "epoch": 1.333418140185727, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9737348556518555, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8720650672912598, + "num_tokens": 400066842.0, + "step": 10482 + }, + { + "epoch": 1.3335453504643175, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9540541172027588, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8732731342315674, + "num_tokens": 400099380.0, + "step": 10483 + }, + { + "epoch": 1.333672560742908, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9261940717697144, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8755543828010559, + "num_tokens": 400137419.0, + "step": 10484 + }, + { + "epoch": 1.3337997710214986, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9901771545410156, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8638741970062256, + "num_tokens": 400171086.0, + "step": 10485 + }, + { + "epoch": 1.3339269813000891, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 3.9664382934570312, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8617174625396729, + "num_tokens": 400206763.0, + "step": 10486 + }, + { + "epoch": 1.3340541915786797, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0404539108276367, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.848595380783081, + "num_tokens": 400242708.0, + "step": 10487 + }, + { + "epoch": 1.3341814018572702, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9784010648727417, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8431857228279114, + "num_tokens": 400277417.0, + "step": 10488 + }, + { + "epoch": 1.3343086121358605, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9820536375045776, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8617367148399353, + "num_tokens": 400315811.0, + "step": 10489 + }, + { + "epoch": 1.334435822414451, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.097724437713623, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8448163270950317, + "num_tokens": 400356321.0, + "step": 10490 + }, + { + "epoch": 1.3345630326930416, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1776106357574463, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8619574904441833, + "num_tokens": 400392382.0, + "step": 10491 + }, + { + "epoch": 1.334690242971632, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.788788914680481, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8715932369232178, + "num_tokens": 400434947.0, + "step": 10492 + }, + { + "epoch": 1.3348174532502226, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.928810477256775, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8449261784553528, + "num_tokens": 400472742.0, + "step": 10493 + }, + { + "epoch": 1.3349446635288131, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7793716192245483, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8770219087600708, + "num_tokens": 400510029.0, + "step": 10494 + }, + { + "epoch": 1.3350718738074037, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0366504192352295, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8599530458450317, + "num_tokens": 400547383.0, + "step": 10495 + }, + { + "epoch": 1.3351990840859942, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9866987466812134, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8745973110198975, + "num_tokens": 400585242.0, + "step": 10496 + }, + { + "epoch": 1.3353262943645847, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.81509268283844, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8757113218307495, + "num_tokens": 400623117.0, + "step": 10497 + }, + { + "epoch": 1.3354535046431752, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8626059293746948, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8537522554397583, + "num_tokens": 400660170.0, + "step": 10498 + }, + { + "epoch": 1.3355807149217656, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.98214852809906, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8703248500823975, + "num_tokens": 400700467.0, + "step": 10499 + }, + { + "epoch": 1.335707925200356, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9498525857925415, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8572956323623657, + "num_tokens": 400740269.0, + "step": 10500 + }, + { + "epoch": 1.3358351354789466, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9710910320281982, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.873711884021759, + "num_tokens": 400776465.0, + "step": 10501 + }, + { + "epoch": 1.3359623457575371, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9285489320755005, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8588118553161621, + "num_tokens": 400811210.0, + "step": 10502 + }, + { + "epoch": 1.3360895560361277, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9668693542480469, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8558171391487122, + "num_tokens": 400848820.0, + "step": 10503 + }, + { + "epoch": 1.3362167663147182, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8965575695037842, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8657223582267761, + "num_tokens": 400888774.0, + "step": 10504 + }, + { + "epoch": 1.3363439765933087, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7397290468215942, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8655291795730591, + "num_tokens": 400935067.0, + "step": 10505 + }, + { + "epoch": 1.3364711868718993, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.078843116760254, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8730568885803223, + "num_tokens": 400974122.0, + "step": 10506 + }, + { + "epoch": 1.3365983971504898, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8857805728912354, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8727903962135315, + "num_tokens": 401008911.0, + "step": 10507 + }, + { + "epoch": 1.3367256074290803, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0129921436309814, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8695656657218933, + "num_tokens": 401039453.0, + "step": 10508 + }, + { + "epoch": 1.3368528177076708, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7943331003189087, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8732708096504211, + "num_tokens": 401075854.0, + "step": 10509 + }, + { + "epoch": 1.3369800279862614, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9505547285079956, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8632258176803589, + "num_tokens": 401114922.0, + "step": 10510 + }, + { + "epoch": 1.337107238264852, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7695952653884888, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8721269369125366, + "num_tokens": 401155833.0, + "step": 10511 + }, + { + "epoch": 1.3372344485434424, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8201000690460205, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8699204921722412, + "num_tokens": 401196971.0, + "step": 10512 + }, + { + "epoch": 1.337361658822033, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.918932318687439, + "learning_rate": 1e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.8311699628829956, + "num_tokens": 401237312.0, + "step": 10513 + }, + { + "epoch": 1.3374888691006233, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0034308433532715, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8606588840484619, + "num_tokens": 401274630.0, + "step": 10514 + }, + { + "epoch": 1.3376160793792138, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9186071157455444, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8688201904296875, + "num_tokens": 401311780.0, + "step": 10515 + }, + { + "epoch": 1.3377432896578043, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8736236095428467, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8709729909896851, + "num_tokens": 401352614.0, + "step": 10516 + }, + { + "epoch": 1.3378704999363948, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8960779905319214, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8665163516998291, + "num_tokens": 401394495.0, + "step": 10517 + }, + { + "epoch": 1.3379977102149854, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 5.176486492156982, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8497178554534912, + "num_tokens": 401427758.0, + "step": 10518 + }, + { + "epoch": 1.338124920493576, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.948957920074463, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.848282516002655, + "num_tokens": 401473868.0, + "step": 10519 + }, + { + "epoch": 1.3382521307721664, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0224220752716064, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.846845269203186, + "num_tokens": 401510459.0, + "step": 10520 + }, + { + "epoch": 1.338379341050757, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.013932943344116, + "learning_rate": 1e-06, + "loss": 0.5364, + "mean_token_accuracy": 0.8348150253295898, + "num_tokens": 401545283.0, + "step": 10521 + }, + { + "epoch": 1.3385065513293475, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1210854053497314, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8644205927848816, + "num_tokens": 401577166.0, + "step": 10522 + }, + { + "epoch": 1.3386337616079378, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7149335145950317, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8757497072219849, + "num_tokens": 401619075.0, + "step": 10523 + }, + { + "epoch": 1.3387609718865283, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9042154550552368, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8569591641426086, + "num_tokens": 401661286.0, + "step": 10524 + }, + { + "epoch": 1.3388881821651188, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9055311679840088, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8553297519683838, + "num_tokens": 401699672.0, + "step": 10525 + }, + { + "epoch": 1.3390153924437094, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9916304349899292, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8567467927932739, + "num_tokens": 401732676.0, + "step": 10526 + }, + { + "epoch": 1.3391426027223, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8024635314941406, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8617855906486511, + "num_tokens": 401771593.0, + "step": 10527 + }, + { + "epoch": 1.3392698130008904, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9818975925445557, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8496966361999512, + "num_tokens": 401809605.0, + "step": 10528 + }, + { + "epoch": 1.339397023279481, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9254971742630005, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8549943566322327, + "num_tokens": 401845349.0, + "step": 10529 + }, + { + "epoch": 1.3395242335580715, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7573163509368896, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8858497738838196, + "num_tokens": 401883186.0, + "step": 10530 + }, + { + "epoch": 1.339651443836662, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0741448402404785, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8488070964813232, + "num_tokens": 401921149.0, + "step": 10531 + }, + { + "epoch": 1.3397786541152525, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8267943859100342, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.856552004814148, + "num_tokens": 401961813.0, + "step": 10532 + }, + { + "epoch": 1.339905864393843, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.27183198928833, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8464982509613037, + "num_tokens": 402001212.0, + "step": 10533 + }, + { + "epoch": 1.3400330746724336, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7178442478179932, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.857695460319519, + "num_tokens": 402044648.0, + "step": 10534 + }, + { + "epoch": 1.3401602849510241, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7789888381958008, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8658016920089722, + "num_tokens": 402087391.0, + "step": 10535 + }, + { + "epoch": 1.3402874952296147, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8687670230865479, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8833827972412109, + "num_tokens": 402119598.0, + "step": 10536 + }, + { + "epoch": 1.3404147055082052, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0338551998138428, + "learning_rate": 1e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8383247256278992, + "num_tokens": 402153628.0, + "step": 10537 + }, + { + "epoch": 1.3405419157867955, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0481088161468506, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8796070218086243, + "num_tokens": 402190876.0, + "step": 10538 + }, + { + "epoch": 1.340669126065386, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.780069351196289, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8716875314712524, + "num_tokens": 402227168.0, + "step": 10539 + }, + { + "epoch": 1.3407963363439765, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9361587762832642, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8721295595169067, + "num_tokens": 402262731.0, + "step": 10540 + }, + { + "epoch": 1.340923546622567, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.5167181491851807, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8647810816764832, + "num_tokens": 402299992.0, + "step": 10541 + }, + { + "epoch": 1.3410507569011576, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0880861282348633, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8569279313087463, + "num_tokens": 402339129.0, + "step": 10542 + }, + { + "epoch": 1.3411779671797481, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9381202459335327, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8460683226585388, + "num_tokens": 402375380.0, + "step": 10543 + }, + { + "epoch": 1.3413051774583387, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9455009698867798, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8523540496826172, + "num_tokens": 402412159.0, + "step": 10544 + }, + { + "epoch": 1.3414323877369292, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0459372997283936, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8597559928894043, + "num_tokens": 402446676.0, + "step": 10545 + }, + { + "epoch": 1.3415595980155197, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.670491099357605, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8655470609664917, + "num_tokens": 402490741.0, + "step": 10546 + }, + { + "epoch": 1.3416868082941102, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.982588529586792, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8733837604522705, + "num_tokens": 402527398.0, + "step": 10547 + }, + { + "epoch": 1.3418140185727006, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9146679639816284, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8628029823303223, + "num_tokens": 402565914.0, + "step": 10548 + }, + { + "epoch": 1.341941228851291, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9012762308120728, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8665404319763184, + "num_tokens": 402600671.0, + "step": 10549 + }, + { + "epoch": 1.3420684391298816, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8461860418319702, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8757973909378052, + "num_tokens": 402637381.0, + "step": 10550 + }, + { + "epoch": 1.3421956494084721, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9058880805969238, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8674391508102417, + "num_tokens": 402671190.0, + "step": 10551 + }, + { + "epoch": 1.3423228596870627, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7613266706466675, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8633309602737427, + "num_tokens": 402721518.0, + "step": 10552 + }, + { + "epoch": 1.3424500699656532, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.218247175216675, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.863426923751831, + "num_tokens": 402758949.0, + "step": 10553 + }, + { + "epoch": 1.3425772802442437, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0571703910827637, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8577584028244019, + "num_tokens": 402794342.0, + "step": 10554 + }, + { + "epoch": 1.3427044905228342, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.761550784111023, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8753944635391235, + "num_tokens": 402831031.0, + "step": 10555 + }, + { + "epoch": 1.3428317008014248, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6911275386810303, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8738502264022827, + "num_tokens": 402871486.0, + "step": 10556 + }, + { + "epoch": 1.3429589110800153, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1187796592712402, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8758176565170288, + "num_tokens": 402916024.0, + "step": 10557 + }, + { + "epoch": 1.3430861213586058, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2387497425079346, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8460086584091187, + "num_tokens": 402952116.0, + "step": 10558 + }, + { + "epoch": 1.3432133316371964, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9052846431732178, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8622572422027588, + "num_tokens": 402998064.0, + "step": 10559 + }, + { + "epoch": 1.343340541915787, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.758428931236267, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8681107759475708, + "num_tokens": 403041763.0, + "step": 10560 + }, + { + "epoch": 1.3434677521943774, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2898731231689453, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8748819828033447, + "num_tokens": 403080956.0, + "step": 10561 + }, + { + "epoch": 1.343594962472968, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.5193634033203125, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8725104331970215, + "num_tokens": 403117200.0, + "step": 10562 + }, + { + "epoch": 1.3437221727515583, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.075716733932495, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8639328479766846, + "num_tokens": 403154112.0, + "step": 10563 + }, + { + "epoch": 1.3438493830301488, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8577219247817993, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8622524738311768, + "num_tokens": 403188265.0, + "step": 10564 + }, + { + "epoch": 1.3439765933087393, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.829026460647583, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8410847187042236, + "num_tokens": 403231236.0, + "step": 10565 + }, + { + "epoch": 1.3441038035873298, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9259885549545288, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8658851385116577, + "num_tokens": 403268642.0, + "step": 10566 + }, + { + "epoch": 1.3442310138659204, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7921956777572632, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8718589544296265, + "num_tokens": 403307679.0, + "step": 10567 + }, + { + "epoch": 1.344358224144511, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.771005630493164, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8695217967033386, + "num_tokens": 403346395.0, + "step": 10568 + }, + { + "epoch": 1.3444854344231014, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.940140962600708, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.871584415435791, + "num_tokens": 403383001.0, + "step": 10569 + }, + { + "epoch": 1.344612644701692, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7573764324188232, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8553972244262695, + "num_tokens": 403426090.0, + "step": 10570 + }, + { + "epoch": 1.3447398549802825, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8650431632995605, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8572385311126709, + "num_tokens": 403470231.0, + "step": 10571 + }, + { + "epoch": 1.3448670652588728, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8673741817474365, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.869198739528656, + "num_tokens": 403508677.0, + "step": 10572 + }, + { + "epoch": 1.3449942755374633, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1011626720428467, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8440799117088318, + "num_tokens": 403545175.0, + "step": 10573 + }, + { + "epoch": 1.3451214858160538, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8264412879943848, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8778189420700073, + "num_tokens": 403583321.0, + "step": 10574 + }, + { + "epoch": 1.3452486960946444, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8821887969970703, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8566358089447021, + "num_tokens": 403618547.0, + "step": 10575 + }, + { + "epoch": 1.345375906373235, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 16.62535285949707, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.866490364074707, + "num_tokens": 403657552.0, + "step": 10576 + }, + { + "epoch": 1.3455031166518254, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.244450807571411, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8664871454238892, + "num_tokens": 403697021.0, + "step": 10577 + }, + { + "epoch": 1.345630326930416, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.128680467605591, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8530579805374146, + "num_tokens": 403736480.0, + "step": 10578 + }, + { + "epoch": 1.3457575372090065, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.010660171508789, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8712477684020996, + "num_tokens": 403772056.0, + "step": 10579 + }, + { + "epoch": 1.345884747487597, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8149467706680298, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8586554527282715, + "num_tokens": 403813646.0, + "step": 10580 + }, + { + "epoch": 1.3460119577661875, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7789170742034912, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8674193024635315, + "num_tokens": 403853904.0, + "step": 10581 + }, + { + "epoch": 1.346139168044778, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8023041486740112, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8470972776412964, + "num_tokens": 403898528.0, + "step": 10582 + }, + { + "epoch": 1.3462663783233686, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.305793523788452, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8496283888816833, + "num_tokens": 403929518.0, + "step": 10583 + }, + { + "epoch": 1.3463935886019591, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.941323161125183, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8661398887634277, + "num_tokens": 403970323.0, + "step": 10584 + }, + { + "epoch": 1.3465207988805497, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7247415781021118, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8684187531471252, + "num_tokens": 404012499.0, + "step": 10585 + }, + { + "epoch": 1.3466480091591402, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8178784847259521, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8591737747192383, + "num_tokens": 404055370.0, + "step": 10586 + }, + { + "epoch": 1.3467752194377305, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.821763038635254, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8654775023460388, + "num_tokens": 404092498.0, + "step": 10587 + }, + { + "epoch": 1.346902429716321, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.901856780052185, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.869385302066803, + "num_tokens": 404132466.0, + "step": 10588 + }, + { + "epoch": 1.3470296399949115, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.756084680557251, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8640430569648743, + "num_tokens": 404171913.0, + "step": 10589 + }, + { + "epoch": 1.347156850273502, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.99920654296875, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8773481845855713, + "num_tokens": 404205817.0, + "step": 10590 + }, + { + "epoch": 1.3472840605520926, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8841768503189087, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8680820465087891, + "num_tokens": 404243922.0, + "step": 10591 + }, + { + "epoch": 1.3474112708306831, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3492794036865234, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.865798830986023, + "num_tokens": 404283392.0, + "step": 10592 + }, + { + "epoch": 1.3475384811092737, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0531795024871826, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8379979133605957, + "num_tokens": 404323044.0, + "step": 10593 + }, + { + "epoch": 1.3476656913878642, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.966371774673462, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8697234392166138, + "num_tokens": 404359761.0, + "step": 10594 + }, + { + "epoch": 1.3477929016664547, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0519652366638184, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8504717946052551, + "num_tokens": 404395354.0, + "step": 10595 + }, + { + "epoch": 1.3479201119450452, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0282347202301025, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8477341532707214, + "num_tokens": 404432052.0, + "step": 10596 + }, + { + "epoch": 1.3480473222236355, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0106594562530518, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8561148643493652, + "num_tokens": 404472689.0, + "step": 10597 + }, + { + "epoch": 1.348174532502226, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7836849689483643, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8483376502990723, + "num_tokens": 404515916.0, + "step": 10598 + }, + { + "epoch": 1.3483017427808166, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9205042123794556, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8616839647293091, + "num_tokens": 404557399.0, + "step": 10599 + }, + { + "epoch": 1.3484289530594071, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9823148250579834, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8598966002464294, + "num_tokens": 404592573.0, + "step": 10600 + }, + { + "epoch": 1.3485561633379977, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8592687845230103, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8608183860778809, + "num_tokens": 404635266.0, + "step": 10601 + }, + { + "epoch": 1.3486833736165882, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7724581956863403, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8717244863510132, + "num_tokens": 404673004.0, + "step": 10602 + }, + { + "epoch": 1.3488105838951787, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.9305930137634277, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8474755883216858, + "num_tokens": 404705153.0, + "step": 10603 + }, + { + "epoch": 1.3489377941737692, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.7986459732055664, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8540108799934387, + "num_tokens": 404737068.0, + "step": 10604 + }, + { + "epoch": 1.3490650044523598, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0476787090301514, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.843774139881134, + "num_tokens": 404773680.0, + "step": 10605 + }, + { + "epoch": 1.3491922147309503, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8874233961105347, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8696377277374268, + "num_tokens": 404809937.0, + "step": 10606 + }, + { + "epoch": 1.3493194250095408, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0583293437957764, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8548958897590637, + "num_tokens": 404847146.0, + "step": 10607 + }, + { + "epoch": 1.3494466352881314, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9873830080032349, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.864233672618866, + "num_tokens": 404882952.0, + "step": 10608 + }, + { + "epoch": 1.3495738455667219, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6705430746078491, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.873609185218811, + "num_tokens": 404925347.0, + "step": 10609 + }, + { + "epoch": 1.3497010558453124, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8642200231552124, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.849069356918335, + "num_tokens": 404963411.0, + "step": 10610 + }, + { + "epoch": 1.349828266123903, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8396034240722656, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8616042733192444, + "num_tokens": 404997522.0, + "step": 10611 + }, + { + "epoch": 1.3499554764024932, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8874517679214478, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8608791828155518, + "num_tokens": 405034049.0, + "step": 10612 + }, + { + "epoch": 1.3500826866810838, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8780633211135864, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.860243558883667, + "num_tokens": 405074985.0, + "step": 10613 + }, + { + "epoch": 1.3502098969596743, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7877976894378662, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8786962032318115, + "num_tokens": 405116133.0, + "step": 10614 + }, + { + "epoch": 1.3503371072382648, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7302258014678955, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.861491858959198, + "num_tokens": 405155572.0, + "step": 10615 + }, + { + "epoch": 1.3504643175168554, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9545228481292725, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8660285472869873, + "num_tokens": 405194700.0, + "step": 10616 + }, + { + "epoch": 1.350591527795446, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9356828927993774, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.858123242855072, + "num_tokens": 405233582.0, + "step": 10617 + }, + { + "epoch": 1.3507187380740364, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8144258260726929, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8745391964912415, + "num_tokens": 405274987.0, + "step": 10618 + }, + { + "epoch": 1.350845948352627, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9131797552108765, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8605362176895142, + "num_tokens": 405308151.0, + "step": 10619 + }, + { + "epoch": 1.3509731586312175, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9019863605499268, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8625620603561401, + "num_tokens": 405343939.0, + "step": 10620 + }, + { + "epoch": 1.3511003689098078, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7965724468231201, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8625844717025757, + "num_tokens": 405384211.0, + "step": 10621 + }, + { + "epoch": 1.3512275791883983, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9049882888793945, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.869849443435669, + "num_tokens": 405419971.0, + "step": 10622 + }, + { + "epoch": 1.3513547894669888, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.125793218612671, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8565560579299927, + "num_tokens": 405451322.0, + "step": 10623 + }, + { + "epoch": 1.3514819997455794, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.809968948364258, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8544880151748657, + "num_tokens": 405496101.0, + "step": 10624 + }, + { + "epoch": 1.35160921002417, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0569052696228027, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8468033075332642, + "num_tokens": 405530034.0, + "step": 10625 + }, + { + "epoch": 1.3517364203027604, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7766472101211548, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8749384880065918, + "num_tokens": 405570893.0, + "step": 10626 + }, + { + "epoch": 1.351863630581351, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.787265419960022, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8503459692001343, + "num_tokens": 405610120.0, + "step": 10627 + }, + { + "epoch": 1.3519908408599415, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 16.592761993408203, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8625667095184326, + "num_tokens": 405651878.0, + "step": 10628 + }, + { + "epoch": 1.352118051138532, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8688406944274902, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8676853179931641, + "num_tokens": 405694578.0, + "step": 10629 + }, + { + "epoch": 1.3522452614171225, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8789207935333252, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8687285780906677, + "num_tokens": 405734128.0, + "step": 10630 + }, + { + "epoch": 1.352372471695713, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8364499807357788, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8679542541503906, + "num_tokens": 405776588.0, + "step": 10631 + }, + { + "epoch": 1.3524996819743036, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1062543392181396, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.859612226486206, + "num_tokens": 405810867.0, + "step": 10632 + }, + { + "epoch": 1.3526268922528941, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7401719093322754, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.880409836769104, + "num_tokens": 405848884.0, + "step": 10633 + }, + { + "epoch": 1.3527541025314846, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7326548099517822, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8809823989868164, + "num_tokens": 405888759.0, + "step": 10634 + }, + { + "epoch": 1.3528813128100752, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8825304508209229, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.876008152961731, + "num_tokens": 405925686.0, + "step": 10635 + }, + { + "epoch": 1.3530085230886655, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1522552967071533, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8776818513870239, + "num_tokens": 405960836.0, + "step": 10636 + }, + { + "epoch": 1.353135733367256, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7562559843063354, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8741791248321533, + "num_tokens": 406000314.0, + "step": 10637 + }, + { + "epoch": 1.3532629436458465, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8255665302276611, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8734719753265381, + "num_tokens": 406039468.0, + "step": 10638 + }, + { + "epoch": 1.353390153924437, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7704259157180786, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8871316313743591, + "num_tokens": 406076828.0, + "step": 10639 + }, + { + "epoch": 1.3535173642030276, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9166038036346436, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8624647259712219, + "num_tokens": 406116504.0, + "step": 10640 + }, + { + "epoch": 1.3536445744816181, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.063905715942383, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8620113730430603, + "num_tokens": 406155184.0, + "step": 10641 + }, + { + "epoch": 1.3537717847602087, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9359114170074463, + "learning_rate": 1e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8440738916397095, + "num_tokens": 406195411.0, + "step": 10642 + }, + { + "epoch": 1.3538989950387992, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1290090084075928, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8545200824737549, + "num_tokens": 406230159.0, + "step": 10643 + }, + { + "epoch": 1.3540262053173897, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9777714014053345, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8527967929840088, + "num_tokens": 406264541.0, + "step": 10644 + }, + { + "epoch": 1.3541534155959802, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8750221729278564, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.870040774345398, + "num_tokens": 406301869.0, + "step": 10645 + }, + { + "epoch": 1.3542806258745705, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9726150035858154, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8652398586273193, + "num_tokens": 406338819.0, + "step": 10646 + }, + { + "epoch": 1.354407836153161, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9310544729232788, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.864585280418396, + "num_tokens": 406375914.0, + "step": 10647 + }, + { + "epoch": 1.3545350464317516, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9296650886535645, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8687839508056641, + "num_tokens": 406418543.0, + "step": 10648 + }, + { + "epoch": 1.3546622567103421, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8933959007263184, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8606294393539429, + "num_tokens": 406455731.0, + "step": 10649 + }, + { + "epoch": 1.3547894669889327, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8483117818832397, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8629323244094849, + "num_tokens": 406496266.0, + "step": 10650 + }, + { + "epoch": 1.3549166772675232, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8359901905059814, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8584597706794739, + "num_tokens": 406541676.0, + "step": 10651 + }, + { + "epoch": 1.3550438875461137, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.826709270477295, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8594036102294922, + "num_tokens": 406578997.0, + "step": 10652 + }, + { + "epoch": 1.3551710978247042, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8029074668884277, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8678259253501892, + "num_tokens": 406619404.0, + "step": 10653 + }, + { + "epoch": 1.3552983081032948, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8149683475494385, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8820613026618958, + "num_tokens": 406656092.0, + "step": 10654 + }, + { + "epoch": 1.3554255183818853, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9122394323349, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8471205234527588, + "num_tokens": 406701600.0, + "step": 10655 + }, + { + "epoch": 1.3555527286604758, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8978303670883179, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8625791072845459, + "num_tokens": 406739649.0, + "step": 10656 + }, + { + "epoch": 1.3556799389390664, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1048977375030518, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.883357048034668, + "num_tokens": 406774962.0, + "step": 10657 + }, + { + "epoch": 1.3558071492176569, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8691487312316895, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8598860502243042, + "num_tokens": 406818047.0, + "step": 10658 + }, + { + "epoch": 1.3559343594962474, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.545400857925415, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8557222485542297, + "num_tokens": 406859464.0, + "step": 10659 + }, + { + "epoch": 1.356061569774838, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8759605884552002, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8627626299858093, + "num_tokens": 406896444.0, + "step": 10660 + }, + { + "epoch": 1.3561887800534282, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.932986855506897, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8574410676956177, + "num_tokens": 406934623.0, + "step": 10661 + }, + { + "epoch": 1.3563159903320188, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7543158531188965, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8557962775230408, + "num_tokens": 406974290.0, + "step": 10662 + }, + { + "epoch": 1.3564432006106093, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7759531736373901, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.881050705909729, + "num_tokens": 407011279.0, + "step": 10663 + }, + { + "epoch": 1.3565704108891998, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8411033153533936, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8530688285827637, + "num_tokens": 407054733.0, + "step": 10664 + }, + { + "epoch": 1.3566976211677904, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9585790634155273, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8636587262153625, + "num_tokens": 407098410.0, + "step": 10665 + }, + { + "epoch": 1.3568248314463809, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 3.1221532821655273, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8574427366256714, + "num_tokens": 407134231.0, + "step": 10666 + }, + { + "epoch": 1.3569520417249714, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.026481866836548, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8486520051956177, + "num_tokens": 407167231.0, + "step": 10667 + }, + { + "epoch": 1.357079252003562, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8466895818710327, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8645531535148621, + "num_tokens": 407206604.0, + "step": 10668 + }, + { + "epoch": 1.3572064622821525, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8004567623138428, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8658702373504639, + "num_tokens": 407248247.0, + "step": 10669 + }, + { + "epoch": 1.3573336725607428, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3217146396636963, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8740352392196655, + "num_tokens": 407282870.0, + "step": 10670 + }, + { + "epoch": 1.3574608828393333, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9060258865356445, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8693062663078308, + "num_tokens": 407318704.0, + "step": 10671 + }, + { + "epoch": 1.3575880931179238, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.908747673034668, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8659685850143433, + "num_tokens": 407360003.0, + "step": 10672 + }, + { + "epoch": 1.3577153033965144, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 16.616865158081055, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8573155403137207, + "num_tokens": 407399473.0, + "step": 10673 + }, + { + "epoch": 1.357842513675105, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.147397756576538, + "learning_rate": 1e-06, + "loss": 0.5355, + "mean_token_accuracy": 0.8328309059143066, + "num_tokens": 407440576.0, + "step": 10674 + }, + { + "epoch": 1.3579697239536954, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9630401134490967, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8604048490524292, + "num_tokens": 407481057.0, + "step": 10675 + }, + { + "epoch": 1.358096934232286, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9148768186569214, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8755497336387634, + "num_tokens": 407517104.0, + "step": 10676 + }, + { + "epoch": 1.3582241445108765, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8980319499969482, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8697516918182373, + "num_tokens": 407552604.0, + "step": 10677 + }, + { + "epoch": 1.358351354789467, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8238526582717896, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8780990242958069, + "num_tokens": 407588282.0, + "step": 10678 + }, + { + "epoch": 1.3584785650680575, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.170232057571411, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8585904240608215, + "num_tokens": 407619837.0, + "step": 10679 + }, + { + "epoch": 1.358605775346648, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8405427932739258, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8602354526519775, + "num_tokens": 407656939.0, + "step": 10680 + }, + { + "epoch": 1.3587329856252386, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.062358856201172, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.853781521320343, + "num_tokens": 407702756.0, + "step": 10681 + }, + { + "epoch": 1.3588601959038291, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.882071614265442, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8722621202468872, + "num_tokens": 407736142.0, + "step": 10682 + }, + { + "epoch": 1.3589874061824196, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8184130191802979, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.860622763633728, + "num_tokens": 407773184.0, + "step": 10683 + }, + { + "epoch": 1.3591146164610102, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9790756702423096, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8459988832473755, + "num_tokens": 407806543.0, + "step": 10684 + }, + { + "epoch": 1.3592418267396005, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8479701280593872, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8613770008087158, + "num_tokens": 407841371.0, + "step": 10685 + }, + { + "epoch": 1.359369037018191, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9783326387405396, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8661859035491943, + "num_tokens": 407879862.0, + "step": 10686 + }, + { + "epoch": 1.3594962472967815, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.010754346847534, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8571619987487793, + "num_tokens": 407917947.0, + "step": 10687 + }, + { + "epoch": 1.359623457575372, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.93593168258667, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8703770041465759, + "num_tokens": 407953987.0, + "step": 10688 + }, + { + "epoch": 1.3597506678539626, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9974157810211182, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.846966028213501, + "num_tokens": 407993166.0, + "step": 10689 + }, + { + "epoch": 1.3598778781325531, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.756520390510559, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8728788495063782, + "num_tokens": 408037063.0, + "step": 10690 + }, + { + "epoch": 1.3600050884111436, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.761751413345337, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8669527769088745, + "num_tokens": 408078526.0, + "step": 10691 + }, + { + "epoch": 1.3601322986897342, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8512132167816162, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8708713054656982, + "num_tokens": 408116002.0, + "step": 10692 + }, + { + "epoch": 1.3602595089683247, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8938857316970825, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8592809438705444, + "num_tokens": 408155885.0, + "step": 10693 + }, + { + "epoch": 1.3603867192469152, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.008263111114502, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8687010407447815, + "num_tokens": 408190085.0, + "step": 10694 + }, + { + "epoch": 1.3605139295255055, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9590708017349243, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.864780843257904, + "num_tokens": 408227289.0, + "step": 10695 + }, + { + "epoch": 1.360641139804096, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7192047834396362, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8712271451950073, + "num_tokens": 408271853.0, + "step": 10696 + }, + { + "epoch": 1.3607683500826866, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7223386764526367, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8773868083953857, + "num_tokens": 408311133.0, + "step": 10697 + }, + { + "epoch": 1.3608955603612771, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1472361087799072, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8645452260971069, + "num_tokens": 408342775.0, + "step": 10698 + }, + { + "epoch": 1.3610227706398677, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.802831768989563, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8764479160308838, + "num_tokens": 408379063.0, + "step": 10699 + }, + { + "epoch": 1.3611499809184582, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.921894907951355, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8500070571899414, + "num_tokens": 408415856.0, + "step": 10700 + }, + { + "epoch": 1.3612771911970487, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 7.785998821258545, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8686552047729492, + "num_tokens": 408451522.0, + "step": 10701 + }, + { + "epoch": 1.3614044014756392, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9670885801315308, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8692125678062439, + "num_tokens": 408494605.0, + "step": 10702 + }, + { + "epoch": 1.3615316117542298, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9385380744934082, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8631749749183655, + "num_tokens": 408536373.0, + "step": 10703 + }, + { + "epoch": 1.3616588220328203, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.876182198524475, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8512260913848877, + "num_tokens": 408574724.0, + "step": 10704 + }, + { + "epoch": 1.3617860323114108, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9610121250152588, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8562504053115845, + "num_tokens": 408615057.0, + "step": 10705 + }, + { + "epoch": 1.3619132425900013, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0276992321014404, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8521612882614136, + "num_tokens": 408651324.0, + "step": 10706 + }, + { + "epoch": 1.3620404528685919, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8812453746795654, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8691834211349487, + "num_tokens": 408690533.0, + "step": 10707 + }, + { + "epoch": 1.3621676631471824, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.967611312866211, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8566899299621582, + "num_tokens": 408725970.0, + "step": 10708 + }, + { + "epoch": 1.362294873425773, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.01198673248291, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8717458844184875, + "num_tokens": 408766550.0, + "step": 10709 + }, + { + "epoch": 1.3624220837043632, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 7.830485820770264, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8667551279067993, + "num_tokens": 408804320.0, + "step": 10710 + }, + { + "epoch": 1.3625492939829538, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.093017101287842, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8668634295463562, + "num_tokens": 408840902.0, + "step": 10711 + }, + { + "epoch": 1.3626765042615443, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9229435920715332, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8632602691650391, + "num_tokens": 408879346.0, + "step": 10712 + }, + { + "epoch": 1.3628037145401348, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8492534160614014, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8761172890663147, + "num_tokens": 408919336.0, + "step": 10713 + }, + { + "epoch": 1.3629309248187254, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8256586790084839, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8588632345199585, + "num_tokens": 408961547.0, + "step": 10714 + }, + { + "epoch": 1.3630581350973159, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6869877576828003, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8775482773780823, + "num_tokens": 409003573.0, + "step": 10715 + }, + { + "epoch": 1.3631853453759064, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.93043053150177, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8567917943000793, + "num_tokens": 409045560.0, + "step": 10716 + }, + { + "epoch": 1.363312555654497, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8145442008972168, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8673370480537415, + "num_tokens": 409087245.0, + "step": 10717 + }, + { + "epoch": 1.3634397659330875, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7917232513427734, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8734534382820129, + "num_tokens": 409134177.0, + "step": 10718 + }, + { + "epoch": 1.3635669762116778, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.914247751235962, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8495612144470215, + "num_tokens": 409169495.0, + "step": 10719 + }, + { + "epoch": 1.3636941864902683, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8139466047286987, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8396246433258057, + "num_tokens": 409207781.0, + "step": 10720 + }, + { + "epoch": 1.3638213967688588, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.5647220611572266, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8626753091812134, + "num_tokens": 409245411.0, + "step": 10721 + }, + { + "epoch": 1.3639486070474494, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0253186225891113, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8731237649917603, + "num_tokens": 409281757.0, + "step": 10722 + }, + { + "epoch": 1.3640758173260399, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.042248010635376, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8517743349075317, + "num_tokens": 409321317.0, + "step": 10723 + }, + { + "epoch": 1.3642030276046304, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9899886846542358, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8702884912490845, + "num_tokens": 409358919.0, + "step": 10724 + }, + { + "epoch": 1.364330237883221, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9536547660827637, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8553402423858643, + "num_tokens": 409394355.0, + "step": 10725 + }, + { + "epoch": 1.3644574481618115, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.77049720287323, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8736939430236816, + "num_tokens": 409433781.0, + "step": 10726 + }, + { + "epoch": 1.364584658440402, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.849694848060608, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8531898260116577, + "num_tokens": 409471728.0, + "step": 10727 + }, + { + "epoch": 1.3647118687189925, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8472723960876465, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8830170631408691, + "num_tokens": 409502294.0, + "step": 10728 + }, + { + "epoch": 1.364839078997583, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8436697721481323, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8652694225311279, + "num_tokens": 409545878.0, + "step": 10729 + }, + { + "epoch": 1.3649662892761736, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9213954210281372, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8664961457252502, + "num_tokens": 409584753.0, + "step": 10730 + }, + { + "epoch": 1.3650934995547641, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.2237675189971924, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8681334257125854, + "num_tokens": 409613725.0, + "step": 10731 + }, + { + "epoch": 1.3652207098333546, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9547605514526367, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8559027314186096, + "num_tokens": 409652443.0, + "step": 10732 + }, + { + "epoch": 1.3653479201119452, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9308658838272095, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.869956910610199, + "num_tokens": 409690683.0, + "step": 10733 + }, + { + "epoch": 1.3654751303905355, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.043891668319702, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8641691207885742, + "num_tokens": 409727502.0, + "step": 10734 + }, + { + "epoch": 1.365602340669126, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9679335355758667, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8599529266357422, + "num_tokens": 409764592.0, + "step": 10735 + }, + { + "epoch": 1.3657295509477165, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8415226936340332, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8424972295761108, + "num_tokens": 409809645.0, + "step": 10736 + }, + { + "epoch": 1.365856761226307, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9272438287734985, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8634370565414429, + "num_tokens": 409848048.0, + "step": 10737 + }, + { + "epoch": 1.3659839715048976, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9487426280975342, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8578910231590271, + "num_tokens": 409884176.0, + "step": 10738 + }, + { + "epoch": 1.3661111817834881, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8853477239608765, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8656209111213684, + "num_tokens": 409922630.0, + "step": 10739 + }, + { + "epoch": 1.3662383920620786, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8875126838684082, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8701140880584717, + "num_tokens": 409961876.0, + "step": 10740 + }, + { + "epoch": 1.3663656023406692, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1016879081726074, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8718106746673584, + "num_tokens": 409995432.0, + "step": 10741 + }, + { + "epoch": 1.3664928126192597, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9162046909332275, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8626531958580017, + "num_tokens": 410034876.0, + "step": 10742 + }, + { + "epoch": 1.3666200228978502, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7247029542922974, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8676772117614746, + "num_tokens": 410079190.0, + "step": 10743 + }, + { + "epoch": 1.3667472331764405, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.945676565170288, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8675628900527954, + "num_tokens": 410112475.0, + "step": 10744 + }, + { + "epoch": 1.366874443455031, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6772757768630981, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8788014054298401, + "num_tokens": 410154583.0, + "step": 10745 + }, + { + "epoch": 1.3670016537336216, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.875745415687561, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8637349605560303, + "num_tokens": 410191631.0, + "step": 10746 + }, + { + "epoch": 1.3671288640122121, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9036856889724731, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8425068855285645, + "num_tokens": 410232435.0, + "step": 10747 + }, + { + "epoch": 1.3672560742908026, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1485705375671387, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.868302583694458, + "num_tokens": 410272940.0, + "step": 10748 + }, + { + "epoch": 1.3673832845693932, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8427038192749023, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8594081401824951, + "num_tokens": 410312807.0, + "step": 10749 + }, + { + "epoch": 1.3675104948479837, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.783561110496521, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.869474470615387, + "num_tokens": 410352433.0, + "step": 10750 + }, + { + "epoch": 1.3676377051265742, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.900245189666748, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8640322685241699, + "num_tokens": 410394843.0, + "step": 10751 + }, + { + "epoch": 1.3677649154051648, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7244495153427124, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8656324148178101, + "num_tokens": 410439353.0, + "step": 10752 + }, + { + "epoch": 1.3678921256837553, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.830282211303711, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8419454097747803, + "num_tokens": 410485789.0, + "step": 10753 + }, + { + "epoch": 1.3680193359623458, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.841153860092163, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8475261330604553, + "num_tokens": 410523385.0, + "step": 10754 + }, + { + "epoch": 1.3681465462409363, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9892351627349854, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8578231930732727, + "num_tokens": 410553340.0, + "step": 10755 + }, + { + "epoch": 1.3682737565195269, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8377381563186646, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8726957440376282, + "num_tokens": 410590041.0, + "step": 10756 + }, + { + "epoch": 1.3684009667981174, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7685692310333252, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.865166425704956, + "num_tokens": 410627703.0, + "step": 10757 + }, + { + "epoch": 1.368528177076708, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.846877098083496, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8552570343017578, + "num_tokens": 410662555.0, + "step": 10758 + }, + { + "epoch": 1.3686553873552982, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7459288835525513, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8468998670578003, + "num_tokens": 410706026.0, + "step": 10759 + }, + { + "epoch": 1.3687825976338888, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2626116275787354, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8607021570205688, + "num_tokens": 410743275.0, + "step": 10760 + }, + { + "epoch": 1.3689098079124793, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.181877851486206, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8735232353210449, + "num_tokens": 410781220.0, + "step": 10761 + }, + { + "epoch": 1.3690370181910698, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8857454061508179, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8713099956512451, + "num_tokens": 410820014.0, + "step": 10762 + }, + { + "epoch": 1.3691642284696603, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8098138570785522, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8588651418685913, + "num_tokens": 410858721.0, + "step": 10763 + }, + { + "epoch": 1.3692914387482509, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8023631572723389, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8695069551467896, + "num_tokens": 410898527.0, + "step": 10764 + }, + { + "epoch": 1.3694186490268414, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.791646122932434, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8635576963424683, + "num_tokens": 410938426.0, + "step": 10765 + }, + { + "epoch": 1.369545859305432, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.117889642715454, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8683319687843323, + "num_tokens": 410969940.0, + "step": 10766 + }, + { + "epoch": 1.3696730695840225, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9786189794540405, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8469449281692505, + "num_tokens": 411008859.0, + "step": 10767 + }, + { + "epoch": 1.3698002798626128, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9641982316970825, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8772135376930237, + "num_tokens": 411043516.0, + "step": 10768 + }, + { + "epoch": 1.3699274901412033, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8812967538833618, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8573102951049805, + "num_tokens": 411079092.0, + "step": 10769 + }, + { + "epoch": 1.3700547004197938, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8921136856079102, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8675374388694763, + "num_tokens": 411117360.0, + "step": 10770 + }, + { + "epoch": 1.3701819106983844, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9317877292633057, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8533101081848145, + "num_tokens": 411153615.0, + "step": 10771 + }, + { + "epoch": 1.3703091209769749, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7738839387893677, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8651416301727295, + "num_tokens": 411193571.0, + "step": 10772 + }, + { + "epoch": 1.3704363312555654, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8745920658111572, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8441280722618103, + "num_tokens": 411230427.0, + "step": 10773 + }, + { + "epoch": 1.370563541534156, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.825260877609253, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8629961609840393, + "num_tokens": 411271251.0, + "step": 10774 + }, + { + "epoch": 1.3706907518127465, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.05751633644104, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8649391531944275, + "num_tokens": 411304906.0, + "step": 10775 + }, + { + "epoch": 1.370817962091337, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8879313468933105, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8804922103881836, + "num_tokens": 411341417.0, + "step": 10776 + }, + { + "epoch": 1.3709451723699275, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8859440088272095, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.865638017654419, + "num_tokens": 411382118.0, + "step": 10777 + }, + { + "epoch": 1.371072382648518, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.785317063331604, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8525002002716064, + "num_tokens": 411426702.0, + "step": 10778 + }, + { + "epoch": 1.3711995929271086, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9429597854614258, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8625819087028503, + "num_tokens": 411461689.0, + "step": 10779 + }, + { + "epoch": 1.371326803205699, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9209785461425781, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8650161027908325, + "num_tokens": 411499613.0, + "step": 10780 + }, + { + "epoch": 1.3714540134842896, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0275180339813232, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8623420596122742, + "num_tokens": 411538422.0, + "step": 10781 + }, + { + "epoch": 1.3715812237628802, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.97652006149292, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8768250942230225, + "num_tokens": 411572401.0, + "step": 10782 + }, + { + "epoch": 1.3717084340414705, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.841723918914795, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.869156539440155, + "num_tokens": 411611268.0, + "step": 10783 + }, + { + "epoch": 1.371835644320061, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8969119787216187, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8626410961151123, + "num_tokens": 411648837.0, + "step": 10784 + }, + { + "epoch": 1.3719628545986515, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9262267351150513, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.868216872215271, + "num_tokens": 411690327.0, + "step": 10785 + }, + { + "epoch": 1.372090064877242, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.871663212776184, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8629885911941528, + "num_tokens": 411729665.0, + "step": 10786 + }, + { + "epoch": 1.3722172751558326, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8441734313964844, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8763977289199829, + "num_tokens": 411764716.0, + "step": 10787 + }, + { + "epoch": 1.3723444854344231, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9302377700805664, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8644741773605347, + "num_tokens": 411805560.0, + "step": 10788 + }, + { + "epoch": 1.3724716957130136, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9008594751358032, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8670397400856018, + "num_tokens": 411841151.0, + "step": 10789 + }, + { + "epoch": 1.3725989059916042, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9942859411239624, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8697162866592407, + "num_tokens": 411875088.0, + "step": 10790 + }, + { + "epoch": 1.3727261162701947, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.925100326538086, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8630577325820923, + "num_tokens": 411914554.0, + "step": 10791 + }, + { + "epoch": 1.3728533265487852, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8873252868652344, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8646818399429321, + "num_tokens": 411952237.0, + "step": 10792 + }, + { + "epoch": 1.3729805368273755, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.832192301750183, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8582428693771362, + "num_tokens": 411989676.0, + "step": 10793 + }, + { + "epoch": 1.373107747105966, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.826564908027649, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8771381378173828, + "num_tokens": 412025804.0, + "step": 10794 + }, + { + "epoch": 1.3732349573845566, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.870350956916809, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8798930644989014, + "num_tokens": 412060085.0, + "step": 10795 + }, + { + "epoch": 1.3733621676631471, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8180179595947266, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8704432845115662, + "num_tokens": 412100750.0, + "step": 10796 + }, + { + "epoch": 1.3734893779417376, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6886849403381348, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8663083910942078, + "num_tokens": 412145829.0, + "step": 10797 + }, + { + "epoch": 1.3736165882203282, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8000236749649048, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8704203367233276, + "num_tokens": 412187607.0, + "step": 10798 + }, + { + "epoch": 1.3737437984989187, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8887193202972412, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8508502244949341, + "num_tokens": 412227214.0, + "step": 10799 + }, + { + "epoch": 1.3738710087775092, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8727662563323975, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8663288950920105, + "num_tokens": 412261929.0, + "step": 10800 + }, + { + "epoch": 1.3739982190560998, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.926629662513733, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8600434064865112, + "num_tokens": 412299441.0, + "step": 10801 + }, + { + "epoch": 1.3741254293346903, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8735384941101074, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8626017570495605, + "num_tokens": 412339248.0, + "step": 10802 + }, + { + "epoch": 1.3742526396132808, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.98198401927948, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8626411557197571, + "num_tokens": 412377224.0, + "step": 10803 + }, + { + "epoch": 1.3743798498918713, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0117337703704834, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8414255976676941, + "num_tokens": 412415331.0, + "step": 10804 + }, + { + "epoch": 1.3745070601704619, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8651440143585205, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8629236221313477, + "num_tokens": 412455604.0, + "step": 10805 + }, + { + "epoch": 1.3746342704490524, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9977703094482422, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8701144456863403, + "num_tokens": 412489295.0, + "step": 10806 + }, + { + "epoch": 1.374761480727643, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.977131962776184, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8600226640701294, + "num_tokens": 412525750.0, + "step": 10807 + }, + { + "epoch": 1.3748886910062332, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9076586961746216, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8728609681129456, + "num_tokens": 412567163.0, + "step": 10808 + }, + { + "epoch": 1.3750159012848238, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9757286310195923, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8714951276779175, + "num_tokens": 412598789.0, + "step": 10809 + }, + { + "epoch": 1.3751431115634143, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.92697012424469, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8673575520515442, + "num_tokens": 412632588.0, + "step": 10810 + }, + { + "epoch": 1.3752703218420048, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1597683429718018, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8630242347717285, + "num_tokens": 412676369.0, + "step": 10811 + }, + { + "epoch": 1.3753975321205953, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8470828533172607, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8787100315093994, + "num_tokens": 412712674.0, + "step": 10812 + }, + { + "epoch": 1.3755247423991859, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8094404935836792, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8579504489898682, + "num_tokens": 412753531.0, + "step": 10813 + }, + { + "epoch": 1.3756519526777764, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8400553464889526, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8617355823516846, + "num_tokens": 412799450.0, + "step": 10814 + }, + { + "epoch": 1.375779162956367, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.77504301071167, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8864170908927917, + "num_tokens": 412836858.0, + "step": 10815 + }, + { + "epoch": 1.3759063732349575, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8295892477035522, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8678943514823914, + "num_tokens": 412871169.0, + "step": 10816 + }, + { + "epoch": 1.3760335835135478, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8846687078475952, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8696105480194092, + "num_tokens": 412909281.0, + "step": 10817 + }, + { + "epoch": 1.3761607937921383, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8152515888214111, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8639823198318481, + "num_tokens": 412950126.0, + "step": 10818 + }, + { + "epoch": 1.3762880040707288, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8443723917007446, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8523205518722534, + "num_tokens": 412988644.0, + "step": 10819 + }, + { + "epoch": 1.3764152143493193, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9849780797958374, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8509665131568909, + "num_tokens": 413028403.0, + "step": 10820 + }, + { + "epoch": 1.3765424246279099, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0452821254730225, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8550752997398376, + "num_tokens": 413067429.0, + "step": 10821 + }, + { + "epoch": 1.3766696349065004, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.185912847518921, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8603190779685974, + "num_tokens": 413101630.0, + "step": 10822 + }, + { + "epoch": 1.376796845185091, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8240152597427368, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8569027185440063, + "num_tokens": 413147757.0, + "step": 10823 + }, + { + "epoch": 1.3769240554636815, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.84660005569458, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.866613507270813, + "num_tokens": 413186803.0, + "step": 10824 + }, + { + "epoch": 1.377051265742272, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8878862857818604, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8584513664245605, + "num_tokens": 413227201.0, + "step": 10825 + }, + { + "epoch": 1.3771784760208625, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7369394302368164, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8631114959716797, + "num_tokens": 413270594.0, + "step": 10826 + }, + { + "epoch": 1.377305686299453, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9637668132781982, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8459075093269348, + "num_tokens": 413307345.0, + "step": 10827 + }, + { + "epoch": 1.3774328965780436, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.849360466003418, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8699531555175781, + "num_tokens": 413344310.0, + "step": 10828 + }, + { + "epoch": 1.377560106856634, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8783303499221802, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8823080062866211, + "num_tokens": 413382047.0, + "step": 10829 + }, + { + "epoch": 1.3776873171352246, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8591108322143555, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8613607883453369, + "num_tokens": 413424658.0, + "step": 10830 + }, + { + "epoch": 1.3778145274138152, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.126941680908203, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8731979131698608, + "num_tokens": 413458386.0, + "step": 10831 + }, + { + "epoch": 1.3779417376924055, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.251657724380493, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8514125943183899, + "num_tokens": 413492530.0, + "step": 10832 + }, + { + "epoch": 1.378068947970996, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8695361614227295, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8754687905311584, + "num_tokens": 413528894.0, + "step": 10833 + }, + { + "epoch": 1.3781961582495865, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9517604112625122, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8591355085372925, + "num_tokens": 413564765.0, + "step": 10834 + }, + { + "epoch": 1.378323368528177, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1048765182495117, + "learning_rate": 1e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8376248478889465, + "num_tokens": 413598122.0, + "step": 10835 + }, + { + "epoch": 1.3784505788067676, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9192160367965698, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8467220067977905, + "num_tokens": 413634519.0, + "step": 10836 + }, + { + "epoch": 1.378577789085358, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9526619911193848, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8555436730384827, + "num_tokens": 413674021.0, + "step": 10837 + }, + { + "epoch": 1.3787049993639486, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.729190468788147, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8696813583374023, + "num_tokens": 413713804.0, + "step": 10838 + }, + { + "epoch": 1.3788322096425392, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9016231298446655, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8708489537239075, + "num_tokens": 413755665.0, + "step": 10839 + }, + { + "epoch": 1.3789594199211297, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.005472421646118, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8622443675994873, + "num_tokens": 413790872.0, + "step": 10840 + }, + { + "epoch": 1.3790866301997202, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8667049407958984, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8834875226020813, + "num_tokens": 413830798.0, + "step": 10841 + }, + { + "epoch": 1.3792138404783105, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9673506021499634, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8684101104736328, + "num_tokens": 413865960.0, + "step": 10842 + }, + { + "epoch": 1.379341050756901, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.973891258239746, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8748253583908081, + "num_tokens": 413898222.0, + "step": 10843 + }, + { + "epoch": 1.3794682610354916, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1888742446899414, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8586019277572632, + "num_tokens": 413931839.0, + "step": 10844 + }, + { + "epoch": 1.3795954713140821, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.219984769821167, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8548577427864075, + "num_tokens": 413963815.0, + "step": 10845 + }, + { + "epoch": 1.3797226815926726, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 80.52790832519531, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8721227645874023, + "num_tokens": 414002476.0, + "step": 10846 + }, + { + "epoch": 1.3798498918712632, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2996110916137695, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8587812185287476, + "num_tokens": 414041121.0, + "step": 10847 + }, + { + "epoch": 1.3799771021498537, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1872856616973877, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8508129715919495, + "num_tokens": 414080447.0, + "step": 10848 + }, + { + "epoch": 1.3801043124284442, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2613089084625244, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8520719408988953, + "num_tokens": 414108925.0, + "step": 10849 + }, + { + "epoch": 1.3802315227070348, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7172203063964844, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8681071996688843, + "num_tokens": 414154296.0, + "step": 10850 + }, + { + "epoch": 1.3803587329856253, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.760825276374817, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.852374792098999, + "num_tokens": 414194612.0, + "step": 10851 + }, + { + "epoch": 1.3804859432642158, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.005232572555542, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.866335391998291, + "num_tokens": 414229939.0, + "step": 10852 + }, + { + "epoch": 1.3806131535428063, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9779918193817139, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.860102117061615, + "num_tokens": 414265193.0, + "step": 10853 + }, + { + "epoch": 1.3807403638213969, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0098016262054443, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8614493608474731, + "num_tokens": 414301471.0, + "step": 10854 + }, + { + "epoch": 1.3808675740999874, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.452775478363037, + "learning_rate": 1e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8357226848602295, + "num_tokens": 414338755.0, + "step": 10855 + }, + { + "epoch": 1.380994784378578, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7789714336395264, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8651123046875, + "num_tokens": 414379996.0, + "step": 10856 + }, + { + "epoch": 1.3811219946571682, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8533354997634888, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8654395937919617, + "num_tokens": 414421905.0, + "step": 10857 + }, + { + "epoch": 1.3812492049357588, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6730729341506958, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8694339990615845, + "num_tokens": 414465149.0, + "step": 10858 + }, + { + "epoch": 1.3813764152143493, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7086455821990967, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8733178377151489, + "num_tokens": 414504086.0, + "step": 10859 + }, + { + "epoch": 1.3815036254929398, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8148894309997559, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8475033044815063, + "num_tokens": 414545392.0, + "step": 10860 + }, + { + "epoch": 1.3816308357715303, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9771684408187866, + "learning_rate": 1e-06, + "loss": 0.544, + "mean_token_accuracy": 0.8283427357673645, + "num_tokens": 414586445.0, + "step": 10861 + }, + { + "epoch": 1.3817580460501209, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9836074113845825, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8725048303604126, + "num_tokens": 414618467.0, + "step": 10862 + }, + { + "epoch": 1.3818852563287114, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.003746509552002, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.873421847820282, + "num_tokens": 414651253.0, + "step": 10863 + }, + { + "epoch": 1.382012466607302, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8771061897277832, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8645682334899902, + "num_tokens": 414692016.0, + "step": 10864 + }, + { + "epoch": 1.3821396768858925, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9628076553344727, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8684175610542297, + "num_tokens": 414731840.0, + "step": 10865 + }, + { + "epoch": 1.3822668871644828, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0817198753356934, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8660897612571716, + "num_tokens": 414767110.0, + "step": 10866 + }, + { + "epoch": 1.3823940974430733, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8822232484817505, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8686343431472778, + "num_tokens": 414799340.0, + "step": 10867 + }, + { + "epoch": 1.3825213077216638, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9263815879821777, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8680608868598938, + "num_tokens": 414840915.0, + "step": 10868 + }, + { + "epoch": 1.3826485180002543, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.953896164894104, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8521180152893066, + "num_tokens": 414877298.0, + "step": 10869 + }, + { + "epoch": 1.3827757282788449, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7832810878753662, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.871077299118042, + "num_tokens": 414916431.0, + "step": 10870 + }, + { + "epoch": 1.3829029385574354, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9434230327606201, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8618671298027039, + "num_tokens": 414958148.0, + "step": 10871 + }, + { + "epoch": 1.383030148836026, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8442895412445068, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8903513550758362, + "num_tokens": 414997008.0, + "step": 10872 + }, + { + "epoch": 1.3831573591146165, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.828184962272644, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8509895205497742, + "num_tokens": 415042893.0, + "step": 10873 + }, + { + "epoch": 1.383284569393207, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8911477327346802, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8621141910552979, + "num_tokens": 415077476.0, + "step": 10874 + }, + { + "epoch": 1.3834117796717975, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7280632257461548, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.858891487121582, + "num_tokens": 415124329.0, + "step": 10875 + }, + { + "epoch": 1.383538989950388, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7765687704086304, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8761876225471497, + "num_tokens": 415164101.0, + "step": 10876 + }, + { + "epoch": 1.3836662002289786, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9747778177261353, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8656454086303711, + "num_tokens": 415203792.0, + "step": 10877 + }, + { + "epoch": 1.383793410507569, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1071701049804688, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8603900671005249, + "num_tokens": 415233754.0, + "step": 10878 + }, + { + "epoch": 1.3839206207861596, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.064513683319092, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8724479675292969, + "num_tokens": 415269567.0, + "step": 10879 + }, + { + "epoch": 1.3840478310647502, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8685188293457031, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8672839403152466, + "num_tokens": 415312660.0, + "step": 10880 + }, + { + "epoch": 1.3841750413433405, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.754464030265808, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8751664161682129, + "num_tokens": 415355377.0, + "step": 10881 + }, + { + "epoch": 1.384302251621931, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7931280136108398, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8754342198371887, + "num_tokens": 415393109.0, + "step": 10882 + }, + { + "epoch": 1.3844294619005215, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7402300834655762, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8759634494781494, + "num_tokens": 415428650.0, + "step": 10883 + }, + { + "epoch": 1.384556672179112, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.774918794631958, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8606452345848083, + "num_tokens": 415467166.0, + "step": 10884 + }, + { + "epoch": 1.3846838824577026, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7488218545913696, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.877322793006897, + "num_tokens": 415505633.0, + "step": 10885 + }, + { + "epoch": 1.384811092736293, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9455071687698364, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.864333987236023, + "num_tokens": 415546254.0, + "step": 10886 + }, + { + "epoch": 1.3849383030148836, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.779540777206421, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.86325603723526, + "num_tokens": 415584999.0, + "step": 10887 + }, + { + "epoch": 1.3850655132934742, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8483461141586304, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8670119643211365, + "num_tokens": 415624608.0, + "step": 10888 + }, + { + "epoch": 1.3851927235720647, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9292535781860352, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8509334325790405, + "num_tokens": 415663405.0, + "step": 10889 + }, + { + "epoch": 1.385319933850655, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.859304666519165, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8791032433509827, + "num_tokens": 415698934.0, + "step": 10890 + }, + { + "epoch": 1.3854471441292455, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9863026142120361, + "learning_rate": 1e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.835929274559021, + "num_tokens": 415736973.0, + "step": 10891 + }, + { + "epoch": 1.385574354407836, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.800233006477356, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8675556778907776, + "num_tokens": 415778364.0, + "step": 10892 + }, + { + "epoch": 1.3857015646864266, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8026920557022095, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8799521923065186, + "num_tokens": 415817502.0, + "step": 10893 + }, + { + "epoch": 1.385828774965017, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.710574984550476, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8590843677520752, + "num_tokens": 415860364.0, + "step": 10894 + }, + { + "epoch": 1.3859559852436076, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8455288410186768, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8671849370002747, + "num_tokens": 415896816.0, + "step": 10895 + }, + { + "epoch": 1.3860831955221982, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8428196907043457, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.846080482006073, + "num_tokens": 415935173.0, + "step": 10896 + }, + { + "epoch": 1.3862104058007887, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0422239303588867, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8736107349395752, + "num_tokens": 415977043.0, + "step": 10897 + }, + { + "epoch": 1.3863376160793792, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.983418583869934, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8732811212539673, + "num_tokens": 416014376.0, + "step": 10898 + }, + { + "epoch": 1.3864648263579697, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8454970121383667, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8520486950874329, + "num_tokens": 416055648.0, + "step": 10899 + }, + { + "epoch": 1.3865920366365603, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.037935495376587, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8558632135391235, + "num_tokens": 416087062.0, + "step": 10900 + }, + { + "epoch": 1.3867192469151508, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9564599990844727, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8712139129638672, + "num_tokens": 416122559.0, + "step": 10901 + }, + { + "epoch": 1.3868464571937413, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9464224576950073, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.856123685836792, + "num_tokens": 416155228.0, + "step": 10902 + }, + { + "epoch": 1.3869736674723319, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.072033643722534, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8503861427307129, + "num_tokens": 416191190.0, + "step": 10903 + }, + { + "epoch": 1.3871008777509224, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.967821478843689, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8674842119216919, + "num_tokens": 416228009.0, + "step": 10904 + }, + { + "epoch": 1.387228088029513, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9215155839920044, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8604274392127991, + "num_tokens": 416263657.0, + "step": 10905 + }, + { + "epoch": 1.3873552983081032, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8839067220687866, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8569615483283997, + "num_tokens": 416305187.0, + "step": 10906 + }, + { + "epoch": 1.3874825085866938, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9321436882019043, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8612958192825317, + "num_tokens": 416342578.0, + "step": 10907 + }, + { + "epoch": 1.3876097188652843, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9539960622787476, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8603655695915222, + "num_tokens": 416378969.0, + "step": 10908 + }, + { + "epoch": 1.3877369291438748, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.045243501663208, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.861707329750061, + "num_tokens": 416412958.0, + "step": 10909 + }, + { + "epoch": 1.3878641394224653, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.043064594268799, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8442145586013794, + "num_tokens": 416445643.0, + "step": 10910 + }, + { + "epoch": 1.3879913497010559, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.898897409439087, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8500915765762329, + "num_tokens": 416484891.0, + "step": 10911 + }, + { + "epoch": 1.3881185599796464, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1990466117858887, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8512040972709656, + "num_tokens": 416520504.0, + "step": 10912 + }, + { + "epoch": 1.388245770258237, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9644789695739746, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.864073634147644, + "num_tokens": 416561877.0, + "step": 10913 + }, + { + "epoch": 1.3883729805368275, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.271782398223877, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8641496896743774, + "num_tokens": 416597676.0, + "step": 10914 + }, + { + "epoch": 1.3885001908154178, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.955479383468628, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8698099255561829, + "num_tokens": 416630557.0, + "step": 10915 + }, + { + "epoch": 1.3886274010940083, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8121947050094604, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8654589653015137, + "num_tokens": 416668604.0, + "step": 10916 + }, + { + "epoch": 1.3887546113725988, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8238130807876587, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8438602089881897, + "num_tokens": 416710246.0, + "step": 10917 + }, + { + "epoch": 1.3888818216511893, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8661638498306274, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8644767999649048, + "num_tokens": 416744962.0, + "step": 10918 + }, + { + "epoch": 1.3890090319297799, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9386183023452759, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8654183745384216, + "num_tokens": 416779483.0, + "step": 10919 + }, + { + "epoch": 1.3891362422083704, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8425699472427368, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8763630390167236, + "num_tokens": 416822135.0, + "step": 10920 + }, + { + "epoch": 1.389263452486961, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9228273630142212, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8611564636230469, + "num_tokens": 416860383.0, + "step": 10921 + }, + { + "epoch": 1.3893906627655515, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8872677087783813, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8587354421615601, + "num_tokens": 416896960.0, + "step": 10922 + }, + { + "epoch": 1.389517873044142, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.915579080581665, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.874910831451416, + "num_tokens": 416931981.0, + "step": 10923 + }, + { + "epoch": 1.3896450833227325, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3035731315612793, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8729043006896973, + "num_tokens": 416968179.0, + "step": 10924 + }, + { + "epoch": 1.389772293601323, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8700319528579712, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.868381679058075, + "num_tokens": 417008879.0, + "step": 10925 + }, + { + "epoch": 1.3898995038799136, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8183984756469727, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8750470876693726, + "num_tokens": 417042497.0, + "step": 10926 + }, + { + "epoch": 1.390026714158504, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7921587228775024, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8653676509857178, + "num_tokens": 417084396.0, + "step": 10927 + }, + { + "epoch": 1.3901539244370946, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7813115119934082, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8766255378723145, + "num_tokens": 417121014.0, + "step": 10928 + }, + { + "epoch": 1.3902811347156852, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.128383159637451, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8616564273834229, + "num_tokens": 417160169.0, + "step": 10929 + }, + { + "epoch": 1.3904083449942755, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.196119785308838, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8477017879486084, + "num_tokens": 417195024.0, + "step": 10930 + }, + { + "epoch": 1.390535555272866, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9004911184310913, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8699359893798828, + "num_tokens": 417231344.0, + "step": 10931 + }, + { + "epoch": 1.3906627655514565, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8540642261505127, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8544659614562988, + "num_tokens": 417270266.0, + "step": 10932 + }, + { + "epoch": 1.390789975830047, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8888107538223267, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8610031008720398, + "num_tokens": 417308292.0, + "step": 10933 + }, + { + "epoch": 1.3909171861086376, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8145973682403564, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8600614070892334, + "num_tokens": 417346358.0, + "step": 10934 + }, + { + "epoch": 1.391044396387228, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9174734354019165, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8642465472221375, + "num_tokens": 417381924.0, + "step": 10935 + }, + { + "epoch": 1.3911716066658186, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.4822943210601807, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.837158203125, + "num_tokens": 417417779.0, + "step": 10936 + }, + { + "epoch": 1.3912988169444092, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.909674882888794, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8553287386894226, + "num_tokens": 417459076.0, + "step": 10937 + }, + { + "epoch": 1.3914260272229997, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.843185305595398, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8632598519325256, + "num_tokens": 417499422.0, + "step": 10938 + }, + { + "epoch": 1.39155323750159, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.838505744934082, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8639662265777588, + "num_tokens": 417541776.0, + "step": 10939 + }, + { + "epoch": 1.3916804477801805, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0884077548980713, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8507434129714966, + "num_tokens": 417576469.0, + "step": 10940 + }, + { + "epoch": 1.391807658058771, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7879058122634888, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8648247718811035, + "num_tokens": 417611726.0, + "step": 10941 + }, + { + "epoch": 1.3919348683373616, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.025710105895996, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8559565544128418, + "num_tokens": 417650135.0, + "step": 10942 + }, + { + "epoch": 1.392062078615952, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1303160190582275, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8448305130004883, + "num_tokens": 417690981.0, + "step": 10943 + }, + { + "epoch": 1.3921892888945426, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.958240032196045, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8502479195594788, + "num_tokens": 417732152.0, + "step": 10944 + }, + { + "epoch": 1.3923164991731332, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8642617464065552, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8531181812286377, + "num_tokens": 417771814.0, + "step": 10945 + }, + { + "epoch": 1.3924437094517237, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1029536724090576, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8521819710731506, + "num_tokens": 417810663.0, + "step": 10946 + }, + { + "epoch": 1.3925709197303142, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9850468635559082, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8685795068740845, + "num_tokens": 417845664.0, + "step": 10947 + }, + { + "epoch": 1.3926981300089047, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.737013578414917, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8550699949264526, + "num_tokens": 417889617.0, + "step": 10948 + }, + { + "epoch": 1.3928253402874953, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8572674989700317, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8597191572189331, + "num_tokens": 417930820.0, + "step": 10949 + }, + { + "epoch": 1.3929525505660858, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9429231882095337, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8648805618286133, + "num_tokens": 417966799.0, + "step": 10950 + }, + { + "epoch": 1.3930797608446763, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.781307578086853, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8554673194885254, + "num_tokens": 418009328.0, + "step": 10951 + }, + { + "epoch": 1.3932069711232669, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9163585901260376, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8652846813201904, + "num_tokens": 418048548.0, + "step": 10952 + }, + { + "epoch": 1.3933341814018574, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7681835889816284, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8603488206863403, + "num_tokens": 418094947.0, + "step": 10953 + }, + { + "epoch": 1.393461391680448, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8805166482925415, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8530791997909546, + "num_tokens": 418137144.0, + "step": 10954 + }, + { + "epoch": 1.3935886019590382, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.033773422241211, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8832242488861084, + "num_tokens": 418171449.0, + "step": 10955 + }, + { + "epoch": 1.3937158122376287, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.863490104675293, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8647972345352173, + "num_tokens": 418209854.0, + "step": 10956 + }, + { + "epoch": 1.3938430225162193, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9785698652267456, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8715076446533203, + "num_tokens": 418245121.0, + "step": 10957 + }, + { + "epoch": 1.3939702327948098, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.759750485420227, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8668392300605774, + "num_tokens": 418284158.0, + "step": 10958 + }, + { + "epoch": 1.3940974430734003, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8355317115783691, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8703221082687378, + "num_tokens": 418324091.0, + "step": 10959 + }, + { + "epoch": 1.3942246533519909, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9733788967132568, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8635067939758301, + "num_tokens": 418364764.0, + "step": 10960 + }, + { + "epoch": 1.3943518636305814, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8261184692382812, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8602499961853027, + "num_tokens": 418403974.0, + "step": 10961 + }, + { + "epoch": 1.394479073909172, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9789059162139893, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8520040512084961, + "num_tokens": 418446341.0, + "step": 10962 + }, + { + "epoch": 1.3946062841877624, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8264904022216797, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8625537157058716, + "num_tokens": 418490989.0, + "step": 10963 + }, + { + "epoch": 1.3947334944663528, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0844063758850098, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8580743074417114, + "num_tokens": 418524633.0, + "step": 10964 + }, + { + "epoch": 1.3948607047449433, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.041424036026001, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8626985549926758, + "num_tokens": 418562576.0, + "step": 10965 + }, + { + "epoch": 1.3949879150235338, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0206828117370605, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8625990748405457, + "num_tokens": 418596537.0, + "step": 10966 + }, + { + "epoch": 1.3951151253021243, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.107811450958252, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8593907356262207, + "num_tokens": 418625800.0, + "step": 10967 + }, + { + "epoch": 1.3952423355807149, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9781246185302734, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8482409715652466, + "num_tokens": 418663818.0, + "step": 10968 + }, + { + "epoch": 1.3953695458593054, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.886722207069397, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8687337636947632, + "num_tokens": 418700607.0, + "step": 10969 + }, + { + "epoch": 1.395496756137896, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8858376741409302, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8573986887931824, + "num_tokens": 418739357.0, + "step": 10970 + }, + { + "epoch": 1.3956239664164865, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1635429859161377, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8682827353477478, + "num_tokens": 418769752.0, + "step": 10971 + }, + { + "epoch": 1.395751176695077, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8892773389816284, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8422536849975586, + "num_tokens": 418808196.0, + "step": 10972 + }, + { + "epoch": 1.3958783869736675, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8248767852783203, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8787673711776733, + "num_tokens": 418843335.0, + "step": 10973 + }, + { + "epoch": 1.396005597252258, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9747987985610962, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8706196546554565, + "num_tokens": 418876591.0, + "step": 10974 + }, + { + "epoch": 1.3961328075308486, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.992706298828125, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8501142263412476, + "num_tokens": 418910482.0, + "step": 10975 + }, + { + "epoch": 1.396260017809439, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.438833713531494, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.853938102722168, + "num_tokens": 418942248.0, + "step": 10976 + }, + { + "epoch": 1.3963872280880296, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9946198463439941, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.867243766784668, + "num_tokens": 418977770.0, + "step": 10977 + }, + { + "epoch": 1.3965144383666201, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0657706260681152, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8557467460632324, + "num_tokens": 419016464.0, + "step": 10978 + }, + { + "epoch": 1.3966416486452105, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7781041860580444, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8545577526092529, + "num_tokens": 419053254.0, + "step": 10979 + }, + { + "epoch": 1.396768858923801, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.984346866607666, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8569706082344055, + "num_tokens": 419088940.0, + "step": 10980 + }, + { + "epoch": 1.3968960692023915, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9649615287780762, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8620662689208984, + "num_tokens": 419127897.0, + "step": 10981 + }, + { + "epoch": 1.397023279480982, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8379157781600952, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8652288913726807, + "num_tokens": 419170921.0, + "step": 10982 + }, + { + "epoch": 1.3971504897595726, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9983856678009033, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8692366480827332, + "num_tokens": 419206032.0, + "step": 10983 + }, + { + "epoch": 1.397277700038163, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8988142013549805, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8780111074447632, + "num_tokens": 419244240.0, + "step": 10984 + }, + { + "epoch": 1.3974049103167536, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.906736135482788, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8716048002243042, + "num_tokens": 419274902.0, + "step": 10985 + }, + { + "epoch": 1.3975321205953442, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9448305368423462, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8739677667617798, + "num_tokens": 419312295.0, + "step": 10986 + }, + { + "epoch": 1.3976593308739347, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9451593160629272, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.846311628818512, + "num_tokens": 419350999.0, + "step": 10987 + }, + { + "epoch": 1.397786541152525, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.293087959289551, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8641585111618042, + "num_tokens": 419386739.0, + "step": 10988 + }, + { + "epoch": 1.3979137514311155, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.674822211265564, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.869236946105957, + "num_tokens": 419430353.0, + "step": 10989 + }, + { + "epoch": 1.398040961709706, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0283455848693848, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8583904504776001, + "num_tokens": 419462910.0, + "step": 10990 + }, + { + "epoch": 1.3981681719882966, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9775177240371704, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.872848391532898, + "num_tokens": 419498727.0, + "step": 10991 + }, + { + "epoch": 1.398295382266887, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9941917657852173, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8561322689056396, + "num_tokens": 419534496.0, + "step": 10992 + }, + { + "epoch": 1.3984225925454776, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9255489110946655, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8585928678512573, + "num_tokens": 419575760.0, + "step": 10993 + }, + { + "epoch": 1.3985498028240682, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.85629141330719, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8700907826423645, + "num_tokens": 419616822.0, + "step": 10994 + }, + { + "epoch": 1.3986770131026587, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1418561935424805, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8696168661117554, + "num_tokens": 419650425.0, + "step": 10995 + }, + { + "epoch": 1.3988042233812492, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1715805530548096, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8703835010528564, + "num_tokens": 419685501.0, + "step": 10996 + }, + { + "epoch": 1.3989314336598397, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8909040689468384, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8662837743759155, + "num_tokens": 419724471.0, + "step": 10997 + }, + { + "epoch": 1.3990586439384303, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7573623657226562, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8707083463668823, + "num_tokens": 419763683.0, + "step": 10998 + }, + { + "epoch": 1.3991858542170208, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8192652463912964, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8666729927062988, + "num_tokens": 419801238.0, + "step": 10999 + }, + { + "epoch": 1.3993130644956113, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9745676517486572, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8597733974456787, + "num_tokens": 419832787.0, + "step": 11000 + }, + { + "epoch": 1.3994402747742019, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0465312004089355, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8530892729759216, + "num_tokens": 419864294.0, + "step": 11001 + }, + { + "epoch": 1.3995674850527924, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9027281999588013, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8667638301849365, + "num_tokens": 419900493.0, + "step": 11002 + }, + { + "epoch": 1.399694695331383, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.049203634262085, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8806501030921936, + "num_tokens": 419938917.0, + "step": 11003 + }, + { + "epoch": 1.3998219056099732, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7969311475753784, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8789786100387573, + "num_tokens": 419978771.0, + "step": 11004 + }, + { + "epoch": 1.3999491158885637, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1053144931793213, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8619681596755981, + "num_tokens": 420011836.0, + "step": 11005 + }, + { + "epoch": 1.4000763261671543, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.935083270072937, + "learning_rate": 1e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8427300453186035, + "num_tokens": 420051618.0, + "step": 11006 + }, + { + "epoch": 1.4002035364457448, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8451006412506104, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8494343757629395, + "num_tokens": 420089119.0, + "step": 11007 + }, + { + "epoch": 1.4003307467243353, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9037576913833618, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8735586404800415, + "num_tokens": 420123706.0, + "step": 11008 + }, + { + "epoch": 1.4004579570029259, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0094096660614014, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.860325813293457, + "num_tokens": 420156723.0, + "step": 11009 + }, + { + "epoch": 1.4005851672815164, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8028439283370972, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8630019426345825, + "num_tokens": 420198519.0, + "step": 11010 + }, + { + "epoch": 1.400712377560107, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8626213073730469, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8770418167114258, + "num_tokens": 420231722.0, + "step": 11011 + }, + { + "epoch": 1.4008395878386974, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.950989842414856, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8663405776023865, + "num_tokens": 420272059.0, + "step": 11012 + }, + { + "epoch": 1.4009667981172877, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0225327014923096, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8710217475891113, + "num_tokens": 420307695.0, + "step": 11013 + }, + { + "epoch": 1.4010940083958783, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9407110214233398, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8729634881019592, + "num_tokens": 420345989.0, + "step": 11014 + }, + { + "epoch": 1.4012212186744688, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7273197174072266, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8700269460678101, + "num_tokens": 420387194.0, + "step": 11015 + }, + { + "epoch": 1.4013484289530593, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7257788181304932, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8791407346725464, + "num_tokens": 420427608.0, + "step": 11016 + }, + { + "epoch": 1.4014756392316499, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9184552431106567, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8687281012535095, + "num_tokens": 420461363.0, + "step": 11017 + }, + { + "epoch": 1.4016028495102404, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2804667949676514, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8462811708450317, + "num_tokens": 420503577.0, + "step": 11018 + }, + { + "epoch": 1.401730059788831, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.028912305831909, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8740237355232239, + "num_tokens": 420538852.0, + "step": 11019 + }, + { + "epoch": 1.4018572700674214, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8672964572906494, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8619378805160522, + "num_tokens": 420577495.0, + "step": 11020 + }, + { + "epoch": 1.401984480346012, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8185412883758545, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8632857203483582, + "num_tokens": 420616123.0, + "step": 11021 + }, + { + "epoch": 1.4021116906246025, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8537687063217163, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8750320672988892, + "num_tokens": 420653904.0, + "step": 11022 + }, + { + "epoch": 1.402238900903193, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7340142726898193, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8882132768630981, + "num_tokens": 420693539.0, + "step": 11023 + }, + { + "epoch": 1.4023661111817836, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9903137683868408, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.866549015045166, + "num_tokens": 420729046.0, + "step": 11024 + }, + { + "epoch": 1.402493321460374, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8476442098617554, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8678823709487915, + "num_tokens": 420766143.0, + "step": 11025 + }, + { + "epoch": 1.4026205317389646, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.874947428703308, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8609782457351685, + "num_tokens": 420806367.0, + "step": 11026 + }, + { + "epoch": 1.4027477420175551, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.87003755569458, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8709845542907715, + "num_tokens": 420845081.0, + "step": 11027 + }, + { + "epoch": 1.4028749522961454, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.928622841835022, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8690158128738403, + "num_tokens": 420885833.0, + "step": 11028 + }, + { + "epoch": 1.403002162574736, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1287198066711426, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8692420125007629, + "num_tokens": 420916313.0, + "step": 11029 + }, + { + "epoch": 1.4031293728533265, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1578168869018555, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8456186056137085, + "num_tokens": 420957011.0, + "step": 11030 + }, + { + "epoch": 1.403256583131917, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.836969017982483, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8683269619941711, + "num_tokens": 420994815.0, + "step": 11031 + }, + { + "epoch": 1.4033837934105076, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8692703247070312, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8718851208686829, + "num_tokens": 421033977.0, + "step": 11032 + }, + { + "epoch": 1.403511003689098, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3114969730377197, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8675212860107422, + "num_tokens": 421068871.0, + "step": 11033 + }, + { + "epoch": 1.4036382139676886, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.833644151687622, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8635321855545044, + "num_tokens": 421105680.0, + "step": 11034 + }, + { + "epoch": 1.4037654242462791, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8503310680389404, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8766756057739258, + "num_tokens": 421149519.0, + "step": 11035 + }, + { + "epoch": 1.4038926345248697, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8311939239501953, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8656620383262634, + "num_tokens": 421190195.0, + "step": 11036 + }, + { + "epoch": 1.40401984480346, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1071863174438477, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8602224588394165, + "num_tokens": 421218637.0, + "step": 11037 + }, + { + "epoch": 1.4041470550820505, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8587584495544434, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8693166375160217, + "num_tokens": 421261300.0, + "step": 11038 + }, + { + "epoch": 1.404274265360641, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9744476079940796, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8620556592941284, + "num_tokens": 421297250.0, + "step": 11039 + }, + { + "epoch": 1.4044014756392316, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8238855600357056, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8568525314331055, + "num_tokens": 421334213.0, + "step": 11040 + }, + { + "epoch": 1.404528685917822, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8526641130447388, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8728910684585571, + "num_tokens": 421368341.0, + "step": 11041 + }, + { + "epoch": 1.4046558961964126, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8882184028625488, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8519707322120667, + "num_tokens": 421408278.0, + "step": 11042 + }, + { + "epoch": 1.4047831064750032, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9507263898849487, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.856139063835144, + "num_tokens": 421442559.0, + "step": 11043 + }, + { + "epoch": 1.4049103167535937, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1223740577697754, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.862140417098999, + "num_tokens": 421474014.0, + "step": 11044 + }, + { + "epoch": 1.4050375270321842, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.010740280151367, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8402108550071716, + "num_tokens": 421511193.0, + "step": 11045 + }, + { + "epoch": 1.4051647373107747, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8157628774642944, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8649157285690308, + "num_tokens": 421547895.0, + "step": 11046 + }, + { + "epoch": 1.4052919475893653, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9565396308898926, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8626247644424438, + "num_tokens": 421582883.0, + "step": 11047 + }, + { + "epoch": 1.4054191578679558, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8609923124313354, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8666248321533203, + "num_tokens": 421619838.0, + "step": 11048 + }, + { + "epoch": 1.4055463681465463, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9394381046295166, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8745120763778687, + "num_tokens": 421659451.0, + "step": 11049 + }, + { + "epoch": 1.4056735784251368, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8264039754867554, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8602216243743896, + "num_tokens": 421704826.0, + "step": 11050 + }, + { + "epoch": 1.4058007887037274, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9099923372268677, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8605070114135742, + "num_tokens": 421748953.0, + "step": 11051 + }, + { + "epoch": 1.405927998982318, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.119499921798706, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8463497161865234, + "num_tokens": 421786614.0, + "step": 11052 + }, + { + "epoch": 1.4060552092609082, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0909202098846436, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8590158224105835, + "num_tokens": 421817291.0, + "step": 11053 + }, + { + "epoch": 1.4061824195394987, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9973230361938477, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8612547516822815, + "num_tokens": 421858200.0, + "step": 11054 + }, + { + "epoch": 1.4063096298180893, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7969300746917725, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8631832599639893, + "num_tokens": 421903343.0, + "step": 11055 + }, + { + "epoch": 1.4064368400966798, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.896008014678955, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8798955678939819, + "num_tokens": 421941426.0, + "step": 11056 + }, + { + "epoch": 1.4065640503752703, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8583709001541138, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8609117865562439, + "num_tokens": 421983992.0, + "step": 11057 + }, + { + "epoch": 1.4066912606538609, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1442954540252686, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8722056746482849, + "num_tokens": 422015066.0, + "step": 11058 + }, + { + "epoch": 1.4068184709324514, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.897686243057251, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8663212060928345, + "num_tokens": 422052392.0, + "step": 11059 + }, + { + "epoch": 1.406945681211042, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.7381207942962646, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8611029386520386, + "num_tokens": 422091453.0, + "step": 11060 + }, + { + "epoch": 1.4070728914896324, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 7.809031009674072, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8480819463729858, + "num_tokens": 422131019.0, + "step": 11061 + }, + { + "epoch": 1.4072001017682227, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0121262073516846, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8577790260314941, + "num_tokens": 422172139.0, + "step": 11062 + }, + { + "epoch": 1.4073273120468133, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.082563638687134, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8712483048439026, + "num_tokens": 422209708.0, + "step": 11063 + }, + { + "epoch": 1.4074545223254038, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8976929187774658, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8649805188179016, + "num_tokens": 422245091.0, + "step": 11064 + }, + { + "epoch": 1.4075817326039943, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9883195161819458, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8641135692596436, + "num_tokens": 422280054.0, + "step": 11065 + }, + { + "epoch": 1.4077089428825849, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9991637468338013, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8716604709625244, + "num_tokens": 422310036.0, + "step": 11066 + }, + { + "epoch": 1.4078361531611754, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1354305744171143, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8480106592178345, + "num_tokens": 422347914.0, + "step": 11067 + }, + { + "epoch": 1.407963363439766, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.5264527797698975, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8654015064239502, + "num_tokens": 422383207.0, + "step": 11068 + }, + { + "epoch": 1.4080905737183564, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0224056243896484, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8671079277992249, + "num_tokens": 422414675.0, + "step": 11069 + }, + { + "epoch": 1.408217783996947, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1290194988250732, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8527296781539917, + "num_tokens": 422449746.0, + "step": 11070 + }, + { + "epoch": 1.4083449942755375, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9524478912353516, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8573518991470337, + "num_tokens": 422487789.0, + "step": 11071 + }, + { + "epoch": 1.408472204554128, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9041129350662231, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8696402907371521, + "num_tokens": 422521750.0, + "step": 11072 + }, + { + "epoch": 1.4085994148327186, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7715978622436523, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8728376626968384, + "num_tokens": 422562923.0, + "step": 11073 + }, + { + "epoch": 1.408726625111309, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8863381147384644, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8545105457305908, + "num_tokens": 422606832.0, + "step": 11074 + }, + { + "epoch": 1.4088538353898996, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9588826894760132, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8668522834777832, + "num_tokens": 422644465.0, + "step": 11075 + }, + { + "epoch": 1.4089810456684901, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.010802745819092, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.87181556224823, + "num_tokens": 422681204.0, + "step": 11076 + }, + { + "epoch": 1.4091082559470804, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 7.791265487670898, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8532037734985352, + "num_tokens": 422716325.0, + "step": 11077 + }, + { + "epoch": 1.409235466225671, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.099156618118286, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8538601994514465, + "num_tokens": 422753644.0, + "step": 11078 + }, + { + "epoch": 1.4093626765042615, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0370066165924072, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.863542914390564, + "num_tokens": 422790893.0, + "step": 11079 + }, + { + "epoch": 1.409489886782852, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.008226156234741, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8562508821487427, + "num_tokens": 422829400.0, + "step": 11080 + }, + { + "epoch": 1.4096170970614426, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8681411743164062, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8718580007553101, + "num_tokens": 422867386.0, + "step": 11081 + }, + { + "epoch": 1.409744307340033, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8509595394134521, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8668199777603149, + "num_tokens": 422910736.0, + "step": 11082 + }, + { + "epoch": 1.4098715176186236, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7402263879776, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8683815598487854, + "num_tokens": 422950302.0, + "step": 11083 + }, + { + "epoch": 1.4099987278972141, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9239766597747803, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8630825281143188, + "num_tokens": 422985892.0, + "step": 11084 + }, + { + "epoch": 1.4101259381758047, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9211933612823486, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8557807207107544, + "num_tokens": 423020075.0, + "step": 11085 + }, + { + "epoch": 1.410253148454395, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.849555253982544, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8661272525787354, + "num_tokens": 423058260.0, + "step": 11086 + }, + { + "epoch": 1.4103803587329855, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7803986072540283, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8875010013580322, + "num_tokens": 423092158.0, + "step": 11087 + }, + { + "epoch": 1.410507569011576, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0928878784179688, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8603627681732178, + "num_tokens": 423123999.0, + "step": 11088 + }, + { + "epoch": 1.4106347792901666, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8676729202270508, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8598384857177734, + "num_tokens": 423167111.0, + "step": 11089 + }, + { + "epoch": 1.410761989568757, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.202216625213623, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8565195202827454, + "num_tokens": 423202060.0, + "step": 11090 + }, + { + "epoch": 1.4108891998473476, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9492127895355225, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8561245799064636, + "num_tokens": 423241200.0, + "step": 11091 + }, + { + "epoch": 1.4110164101259381, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7479734420776367, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8730649352073669, + "num_tokens": 423284713.0, + "step": 11092 + }, + { + "epoch": 1.4111436204045287, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.919222354888916, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.866064190864563, + "num_tokens": 423322524.0, + "step": 11093 + }, + { + "epoch": 1.4112708306831192, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9110044240951538, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8649989366531372, + "num_tokens": 423360551.0, + "step": 11094 + }, + { + "epoch": 1.4113980409617097, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9308165311813354, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8736407160758972, + "num_tokens": 423395227.0, + "step": 11095 + }, + { + "epoch": 1.4115252512403003, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0075433254241943, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8509373664855957, + "num_tokens": 423434757.0, + "step": 11096 + }, + { + "epoch": 1.4116524615188908, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.974625587463379, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8822280764579773, + "num_tokens": 423464599.0, + "step": 11097 + }, + { + "epoch": 1.4117796717974813, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9534224271774292, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8562573194503784, + "num_tokens": 423504327.0, + "step": 11098 + }, + { + "epoch": 1.4119068820760718, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0353305339813232, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8458343744277954, + "num_tokens": 423543114.0, + "step": 11099 + }, + { + "epoch": 1.4120340923546624, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.313847064971924, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8508321642875671, + "num_tokens": 423581727.0, + "step": 11100 + }, + { + "epoch": 1.412161302633253, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0707690715789795, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8692860007286072, + "num_tokens": 423617073.0, + "step": 11101 + }, + { + "epoch": 1.4122885129118432, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.72401762008667, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8691076040267944, + "num_tokens": 423664922.0, + "step": 11102 + }, + { + "epoch": 1.4124157231904337, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7441343069076538, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.855121910572052, + "num_tokens": 423706990.0, + "step": 11103 + }, + { + "epoch": 1.4125429334690243, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.83917498588562, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8464696407318115, + "num_tokens": 423745945.0, + "step": 11104 + }, + { + "epoch": 1.4126701437476148, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.991043210029602, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.870031476020813, + "num_tokens": 423780824.0, + "step": 11105 + }, + { + "epoch": 1.4127973540262053, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.886190414428711, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8775787353515625, + "num_tokens": 423818640.0, + "step": 11106 + }, + { + "epoch": 1.4129245643047958, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1047604084014893, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8729296326637268, + "num_tokens": 423856657.0, + "step": 11107 + }, + { + "epoch": 1.4130517745833864, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.759183406829834, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8759843111038208, + "num_tokens": 423900288.0, + "step": 11108 + }, + { + "epoch": 1.413178984861977, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.4954662322998047, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8702957034111023, + "num_tokens": 423935929.0, + "step": 11109 + }, + { + "epoch": 1.4133061951405674, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9257782697677612, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8628129959106445, + "num_tokens": 423970719.0, + "step": 11110 + }, + { + "epoch": 1.4134334054191577, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.918459177017212, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8649183511734009, + "num_tokens": 424012338.0, + "step": 11111 + }, + { + "epoch": 1.4135606156977483, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8669102191925049, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.84740149974823, + "num_tokens": 424050112.0, + "step": 11112 + }, + { + "epoch": 1.4136878259763388, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8867772817611694, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8636385798454285, + "num_tokens": 424092688.0, + "step": 11113 + }, + { + "epoch": 1.4138150362549293, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.817688226699829, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8629886507987976, + "num_tokens": 424131355.0, + "step": 11114 + }, + { + "epoch": 1.4139422465335199, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7671763896942139, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8528403639793396, + "num_tokens": 424177768.0, + "step": 11115 + }, + { + "epoch": 1.4140694568121104, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0031161308288574, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8763821125030518, + "num_tokens": 424215834.0, + "step": 11116 + }, + { + "epoch": 1.414196667090701, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7144534587860107, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8682754039764404, + "num_tokens": 424260002.0, + "step": 11117 + }, + { + "epoch": 1.4143238773692914, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8012120723724365, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8620223999023438, + "num_tokens": 424299343.0, + "step": 11118 + }, + { + "epoch": 1.414451087647882, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7349268198013306, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8579610586166382, + "num_tokens": 424340312.0, + "step": 11119 + }, + { + "epoch": 1.4145782979264725, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8516695499420166, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8725506067276001, + "num_tokens": 424376658.0, + "step": 11120 + }, + { + "epoch": 1.414705508205063, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.4952402114868164, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8759427070617676, + "num_tokens": 424415499.0, + "step": 11121 + }, + { + "epoch": 1.4148327184836536, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.906160831451416, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.865056037902832, + "num_tokens": 424453401.0, + "step": 11122 + }, + { + "epoch": 1.414959928762244, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9532785415649414, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8679373264312744, + "num_tokens": 424490654.0, + "step": 11123 + }, + { + "epoch": 1.4150871390408346, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8611520528793335, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8746106624603271, + "num_tokens": 424534469.0, + "step": 11124 + }, + { + "epoch": 1.4152143493194251, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.127027750015259, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8545006513595581, + "num_tokens": 424570187.0, + "step": 11125 + }, + { + "epoch": 1.4153415595980154, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9243972301483154, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8441684246063232, + "num_tokens": 424607898.0, + "step": 11126 + }, + { + "epoch": 1.415468769876606, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7416467666625977, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.866512656211853, + "num_tokens": 424652156.0, + "step": 11127 + }, + { + "epoch": 1.4155959801551965, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8826870918273926, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8552207350730896, + "num_tokens": 424688735.0, + "step": 11128 + }, + { + "epoch": 1.415723190433787, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7422521114349365, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.865774393081665, + "num_tokens": 424732859.0, + "step": 11129 + }, + { + "epoch": 1.4158504007123776, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0441043376922607, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8560575842857361, + "num_tokens": 424769381.0, + "step": 11130 + }, + { + "epoch": 1.415977610990968, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.055640697479248, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8651461601257324, + "num_tokens": 424802362.0, + "step": 11131 + }, + { + "epoch": 1.4161048212695586, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.876245141029358, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.860013484954834, + "num_tokens": 424837897.0, + "step": 11132 + }, + { + "epoch": 1.4162320315481491, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.4135489463806152, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8757445812225342, + "num_tokens": 424878154.0, + "step": 11133 + }, + { + "epoch": 1.4163592418267397, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9441964626312256, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8688475489616394, + "num_tokens": 424915063.0, + "step": 11134 + }, + { + "epoch": 1.41648645210533, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8395329713821411, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8631823062896729, + "num_tokens": 424955159.0, + "step": 11135 + }, + { + "epoch": 1.4166136623839205, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9132505655288696, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8564323782920837, + "num_tokens": 424992573.0, + "step": 11136 + }, + { + "epoch": 1.416740872662511, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9487159252166748, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8687618374824524, + "num_tokens": 425030452.0, + "step": 11137 + }, + { + "epoch": 1.4168680829411016, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.891045331954956, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.877861499786377, + "num_tokens": 425067118.0, + "step": 11138 + }, + { + "epoch": 1.416995293219692, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9802472591400146, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8533402681350708, + "num_tokens": 425101684.0, + "step": 11139 + }, + { + "epoch": 1.4171225034982826, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8409628868103027, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8730726838111877, + "num_tokens": 425143127.0, + "step": 11140 + }, + { + "epoch": 1.4172497137768731, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.254535675048828, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8643056750297546, + "num_tokens": 425173912.0, + "step": 11141 + }, + { + "epoch": 1.4173769240554637, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9172337055206299, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8694533109664917, + "num_tokens": 425210146.0, + "step": 11142 + }, + { + "epoch": 1.4175041343340542, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8489009141921997, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8769762516021729, + "num_tokens": 425246082.0, + "step": 11143 + }, + { + "epoch": 1.4176313446126447, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8769539594650269, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.860608696937561, + "num_tokens": 425287470.0, + "step": 11144 + }, + { + "epoch": 1.4177585548912353, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9496245384216309, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8606467247009277, + "num_tokens": 425326553.0, + "step": 11145 + }, + { + "epoch": 1.4178857651698258, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3735756874084473, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8574662804603577, + "num_tokens": 425363105.0, + "step": 11146 + }, + { + "epoch": 1.4180129754484163, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8235993385314941, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.85841965675354, + "num_tokens": 425402513.0, + "step": 11147 + }, + { + "epoch": 1.4181401857270068, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8635748624801636, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8647792339324951, + "num_tokens": 425440132.0, + "step": 11148 + }, + { + "epoch": 1.4182673960055974, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8154762983322144, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.862520694732666, + "num_tokens": 425481738.0, + "step": 11149 + }, + { + "epoch": 1.418394606284188, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0658066272735596, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.862591564655304, + "num_tokens": 425516916.0, + "step": 11150 + }, + { + "epoch": 1.4185218165627782, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9226030111312866, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8632638454437256, + "num_tokens": 425555951.0, + "step": 11151 + }, + { + "epoch": 1.4186490268413687, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8706252574920654, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8637393712997437, + "num_tokens": 425595302.0, + "step": 11152 + }, + { + "epoch": 1.4187762371199593, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0006539821624756, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8666009306907654, + "num_tokens": 425623281.0, + "step": 11153 + }, + { + "epoch": 1.4189034473985498, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9750858545303345, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8733869791030884, + "num_tokens": 425659124.0, + "step": 11154 + }, + { + "epoch": 1.4190306576771403, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0336174964904785, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8500831127166748, + "num_tokens": 425695809.0, + "step": 11155 + }, + { + "epoch": 1.4191578679557308, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.516251564025879, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8536975383758545, + "num_tokens": 425731890.0, + "step": 11156 + }, + { + "epoch": 1.4192850782343214, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8822579383850098, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8718302845954895, + "num_tokens": 425770899.0, + "step": 11157 + }, + { + "epoch": 1.419412288512912, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9237937927246094, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8747543096542358, + "num_tokens": 425807812.0, + "step": 11158 + }, + { + "epoch": 1.4195394987915024, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7165441513061523, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.859195351600647, + "num_tokens": 425848997.0, + "step": 11159 + }, + { + "epoch": 1.4196667090700927, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9028127193450928, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8737667798995972, + "num_tokens": 425887816.0, + "step": 11160 + }, + { + "epoch": 1.4197939193486833, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9429762363433838, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.855710506439209, + "num_tokens": 425925488.0, + "step": 11161 + }, + { + "epoch": 1.4199211296272738, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7032356262207031, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8714349865913391, + "num_tokens": 425965616.0, + "step": 11162 + }, + { + "epoch": 1.4200483399058643, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9938631057739258, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8578404784202576, + "num_tokens": 426005308.0, + "step": 11163 + }, + { + "epoch": 1.4201755501844548, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0634024143218994, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8617445230484009, + "num_tokens": 426042339.0, + "step": 11164 + }, + { + "epoch": 1.4203027604630454, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9874589443206787, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8638468384742737, + "num_tokens": 426081925.0, + "step": 11165 + }, + { + "epoch": 1.420429970741636, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8289796113967896, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.858300507068634, + "num_tokens": 426120918.0, + "step": 11166 + }, + { + "epoch": 1.4205571810202264, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8196816444396973, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8532681465148926, + "num_tokens": 426158711.0, + "step": 11167 + }, + { + "epoch": 1.420684391298817, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1631383895874023, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.859656572341919, + "num_tokens": 426197034.0, + "step": 11168 + }, + { + "epoch": 1.4208116015774075, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9547408819198608, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8570203185081482, + "num_tokens": 426239390.0, + "step": 11169 + }, + { + "epoch": 1.420938811855998, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8324699401855469, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8530891537666321, + "num_tokens": 426285488.0, + "step": 11170 + }, + { + "epoch": 1.4210660221345885, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9096088409423828, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8505866527557373, + "num_tokens": 426323955.0, + "step": 11171 + }, + { + "epoch": 1.421193232413179, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8823198080062866, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8845034241676331, + "num_tokens": 426359460.0, + "step": 11172 + }, + { + "epoch": 1.4213204426917696, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.927791953086853, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8555538654327393, + "num_tokens": 426397104.0, + "step": 11173 + }, + { + "epoch": 1.4214476529703601, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7614986896514893, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8718130588531494, + "num_tokens": 426441349.0, + "step": 11174 + }, + { + "epoch": 1.4215748632489504, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7273976802825928, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8727647066116333, + "num_tokens": 426478410.0, + "step": 11175 + }, + { + "epoch": 1.421702073527541, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7434206008911133, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8631234169006348, + "num_tokens": 426519874.0, + "step": 11176 + }, + { + "epoch": 1.4218292838061315, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8454164266586304, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8515466451644897, + "num_tokens": 426559970.0, + "step": 11177 + }, + { + "epoch": 1.421956494084722, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9987881183624268, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8652181625366211, + "num_tokens": 426593897.0, + "step": 11178 + }, + { + "epoch": 1.4220837043633126, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0004591941833496, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8627337217330933, + "num_tokens": 426633712.0, + "step": 11179 + }, + { + "epoch": 1.422210914641903, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.955814003944397, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8786958456039429, + "num_tokens": 426669085.0, + "step": 11180 + }, + { + "epoch": 1.4223381249204936, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7785276174545288, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.877734899520874, + "num_tokens": 426709260.0, + "step": 11181 + }, + { + "epoch": 1.4224653351990841, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.84800386428833, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.852603554725647, + "num_tokens": 426747747.0, + "step": 11182 + }, + { + "epoch": 1.4225925454776747, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8837593793869019, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8502504825592041, + "num_tokens": 426788185.0, + "step": 11183 + }, + { + "epoch": 1.422719755756265, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0808680057525635, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8651291131973267, + "num_tokens": 426828663.0, + "step": 11184 + }, + { + "epoch": 1.4228469660348555, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9528896808624268, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8538544178009033, + "num_tokens": 426869205.0, + "step": 11185 + }, + { + "epoch": 1.422974176313446, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9709129333496094, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8791677951812744, + "num_tokens": 426900561.0, + "step": 11186 + }, + { + "epoch": 1.4231013865920366, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8559951782226562, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8794674277305603, + "num_tokens": 426940618.0, + "step": 11187 + }, + { + "epoch": 1.423228596870627, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0879967212677, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8554558753967285, + "num_tokens": 426975509.0, + "step": 11188 + }, + { + "epoch": 1.4233558071492176, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9784120321273804, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8725706934928894, + "num_tokens": 427011647.0, + "step": 11189 + }, + { + "epoch": 1.4234830174278081, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.991109848022461, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8613458871841431, + "num_tokens": 427046631.0, + "step": 11190 + }, + { + "epoch": 1.4236102277063987, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9291081428527832, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.86125248670578, + "num_tokens": 427081364.0, + "step": 11191 + }, + { + "epoch": 1.4237374379849892, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9362144470214844, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8640984296798706, + "num_tokens": 427118667.0, + "step": 11192 + }, + { + "epoch": 1.4238646482635797, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.125405788421631, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8684645891189575, + "num_tokens": 427155017.0, + "step": 11193 + }, + { + "epoch": 1.4239918585421703, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0051212310791016, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8569960594177246, + "num_tokens": 427191741.0, + "step": 11194 + }, + { + "epoch": 1.4241190688207608, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.020259380340576, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8677541017532349, + "num_tokens": 427226210.0, + "step": 11195 + }, + { + "epoch": 1.4242462790993513, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9644681215286255, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8609131574630737, + "num_tokens": 427261699.0, + "step": 11196 + }, + { + "epoch": 1.4243734893779418, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.292043924331665, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8546340465545654, + "num_tokens": 427297158.0, + "step": 11197 + }, + { + "epoch": 1.4245006996565324, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9055756330490112, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8477233052253723, + "num_tokens": 427338234.0, + "step": 11198 + }, + { + "epoch": 1.424627909935123, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.875966191291809, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8578510880470276, + "num_tokens": 427376727.0, + "step": 11199 + }, + { + "epoch": 1.4247551202137132, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9160410165786743, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8767673969268799, + "num_tokens": 427417095.0, + "step": 11200 + }, + { + "epoch": 1.4248823304923037, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9322065114974976, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8476018905639648, + "num_tokens": 427453243.0, + "step": 11201 + }, + { + "epoch": 1.4250095407708943, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9510009288787842, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8661683797836304, + "num_tokens": 427490961.0, + "step": 11202 + }, + { + "epoch": 1.4251367510494848, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9623184204101562, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8735672831535339, + "num_tokens": 427521487.0, + "step": 11203 + }, + { + "epoch": 1.4252639613280753, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.856870174407959, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8615761399269104, + "num_tokens": 427556607.0, + "step": 11204 + }, + { + "epoch": 1.4253911716066658, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.312995195388794, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8648543357849121, + "num_tokens": 427598940.0, + "step": 11205 + }, + { + "epoch": 1.4255183818852564, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.81122624874115, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8529491424560547, + "num_tokens": 427641337.0, + "step": 11206 + }, + { + "epoch": 1.425645592163847, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7584776878356934, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8754692077636719, + "num_tokens": 427680235.0, + "step": 11207 + }, + { + "epoch": 1.4257728024424374, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.829283595085144, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8649207353591919, + "num_tokens": 427719918.0, + "step": 11208 + }, + { + "epoch": 1.4259000127210277, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.858949899673462, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8662914633750916, + "num_tokens": 427758042.0, + "step": 11209 + }, + { + "epoch": 1.4260272229996183, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9266308546066284, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8655980825424194, + "num_tokens": 427794591.0, + "step": 11210 + }, + { + "epoch": 1.4261544332782088, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0259909629821777, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8715875744819641, + "num_tokens": 427828190.0, + "step": 11211 + }, + { + "epoch": 1.4262816435567993, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.774049162864685, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8565778732299805, + "num_tokens": 427870363.0, + "step": 11212 + }, + { + "epoch": 1.4264088538353898, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8896974325180054, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8790847063064575, + "num_tokens": 427910037.0, + "step": 11213 + }, + { + "epoch": 1.4265360641139804, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1210763454437256, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8632983565330505, + "num_tokens": 427945782.0, + "step": 11214 + }, + { + "epoch": 1.426663274392571, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7574125528335571, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8714238405227661, + "num_tokens": 427984635.0, + "step": 11215 + }, + { + "epoch": 1.4267904846711614, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7919583320617676, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8801149725914001, + "num_tokens": 428024289.0, + "step": 11216 + }, + { + "epoch": 1.426917694949752, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.877894639968872, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8562870621681213, + "num_tokens": 428067119.0, + "step": 11217 + }, + { + "epoch": 1.4270449052283425, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.87920343875885, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8617959022521973, + "num_tokens": 428108150.0, + "step": 11218 + }, + { + "epoch": 1.427172115506933, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0741922855377197, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8751632571220398, + "num_tokens": 428150839.0, + "step": 11219 + }, + { + "epoch": 1.4272993257855235, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7375301122665405, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8514732122421265, + "num_tokens": 428194325.0, + "step": 11220 + }, + { + "epoch": 1.427426536064114, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8912062644958496, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8626982569694519, + "num_tokens": 428230879.0, + "step": 11221 + }, + { + "epoch": 1.4275537463427046, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.055537223815918, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8761944770812988, + "num_tokens": 428265795.0, + "step": 11222 + }, + { + "epoch": 1.4276809566212951, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8542495965957642, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8698742985725403, + "num_tokens": 428300413.0, + "step": 11223 + }, + { + "epoch": 1.4278081668998854, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8885667324066162, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8856850862503052, + "num_tokens": 428339293.0, + "step": 11224 + }, + { + "epoch": 1.427935377178476, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8788623809814453, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8771373629570007, + "num_tokens": 428374158.0, + "step": 11225 + }, + { + "epoch": 1.4280625874570665, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8379532098770142, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8662334680557251, + "num_tokens": 428411715.0, + "step": 11226 + }, + { + "epoch": 1.428189797735657, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8311139345169067, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8527650237083435, + "num_tokens": 428450000.0, + "step": 11227 + }, + { + "epoch": 1.4283170080142475, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9129966497421265, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8709022998809814, + "num_tokens": 428484546.0, + "step": 11228 + }, + { + "epoch": 1.428444218292838, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9781290292739868, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8653681874275208, + "num_tokens": 428527519.0, + "step": 11229 + }, + { + "epoch": 1.4285714285714286, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.861851692199707, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8608537316322327, + "num_tokens": 428568744.0, + "step": 11230 + }, + { + "epoch": 1.4286986388500191, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7235292196273804, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.860551655292511, + "num_tokens": 428613050.0, + "step": 11231 + }, + { + "epoch": 1.4288258491286097, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9836488962173462, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8621374368667603, + "num_tokens": 428647334.0, + "step": 11232 + }, + { + "epoch": 1.4289530594072, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.94328773021698, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8714180588722229, + "num_tokens": 428681015.0, + "step": 11233 + }, + { + "epoch": 1.4290802696857905, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9204611778259277, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8571990728378296, + "num_tokens": 428718903.0, + "step": 11234 + }, + { + "epoch": 1.429207479964381, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9067928791046143, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8733845949172974, + "num_tokens": 428756038.0, + "step": 11235 + }, + { + "epoch": 1.4293346902429716, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9554460048675537, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8814741373062134, + "num_tokens": 428793908.0, + "step": 11236 + }, + { + "epoch": 1.429461900521562, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8965647220611572, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8582063913345337, + "num_tokens": 428830924.0, + "step": 11237 + }, + { + "epoch": 1.4295891108001526, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9166758060455322, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8610389232635498, + "num_tokens": 428869083.0, + "step": 11238 + }, + { + "epoch": 1.4297163210787431, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1990180015563965, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8398970365524292, + "num_tokens": 428903970.0, + "step": 11239 + }, + { + "epoch": 1.4298435313573337, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8685855865478516, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.868497371673584, + "num_tokens": 428940710.0, + "step": 11240 + }, + { + "epoch": 1.4299707416359242, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8125766515731812, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8682130575180054, + "num_tokens": 428981160.0, + "step": 11241 + }, + { + "epoch": 1.4300979519145147, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8543440103530884, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8653693795204163, + "num_tokens": 429022464.0, + "step": 11242 + }, + { + "epoch": 1.4302251621931052, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8352657556533813, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8646253943443298, + "num_tokens": 429060964.0, + "step": 11243 + }, + { + "epoch": 1.4303523724716958, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.715526819229126, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.865991473197937, + "num_tokens": 429101389.0, + "step": 11244 + }, + { + "epoch": 1.4304795827502863, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9611830711364746, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8608514666557312, + "num_tokens": 429135547.0, + "step": 11245 + }, + { + "epoch": 1.4306067930288768, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7049784660339355, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8671746253967285, + "num_tokens": 429177504.0, + "step": 11246 + }, + { + "epoch": 1.4307340033074674, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7605946063995361, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8683979511260986, + "num_tokens": 429217923.0, + "step": 11247 + }, + { + "epoch": 1.430861213586058, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3445332050323486, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8646700978279114, + "num_tokens": 429251514.0, + "step": 11248 + }, + { + "epoch": 1.4309884238646482, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.911577820777893, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8488681316375732, + "num_tokens": 429291747.0, + "step": 11249 + }, + { + "epoch": 1.4311156341432387, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8022511005401611, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.861878514289856, + "num_tokens": 429333480.0, + "step": 11250 + }, + { + "epoch": 1.4312428444218293, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.058061361312866, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.875742495059967, + "num_tokens": 429366482.0, + "step": 11251 + }, + { + "epoch": 1.4313700547004198, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8854037523269653, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8570276498794556, + "num_tokens": 429400704.0, + "step": 11252 + }, + { + "epoch": 1.4314972649790103, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.091825246810913, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8524627685546875, + "num_tokens": 429437463.0, + "step": 11253 + }, + { + "epoch": 1.4316244752576008, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8784722089767456, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8526815176010132, + "num_tokens": 429474496.0, + "step": 11254 + }, + { + "epoch": 1.4317516855361914, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.847161889076233, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8510164618492126, + "num_tokens": 429514641.0, + "step": 11255 + }, + { + "epoch": 1.431878895814782, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7811696529388428, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8647075891494751, + "num_tokens": 429554054.0, + "step": 11256 + }, + { + "epoch": 1.4320061060933724, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9220575094223022, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8621477484703064, + "num_tokens": 429594004.0, + "step": 11257 + }, + { + "epoch": 1.4321333163719627, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8254159688949585, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8712092638015747, + "num_tokens": 429633019.0, + "step": 11258 + }, + { + "epoch": 1.4322605266505533, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.031400442123413, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.86545729637146, + "num_tokens": 429666869.0, + "step": 11259 + }, + { + "epoch": 1.4323877369291438, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9723907709121704, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8747841119766235, + "num_tokens": 429700049.0, + "step": 11260 + }, + { + "epoch": 1.4325149472077343, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9798606634140015, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.861933171749115, + "num_tokens": 429737517.0, + "step": 11261 + }, + { + "epoch": 1.4326421574863248, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.748976230621338, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.877716600894928, + "num_tokens": 429778697.0, + "step": 11262 + }, + { + "epoch": 1.4327693677649154, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9259140491485596, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8641155958175659, + "num_tokens": 429811883.0, + "step": 11263 + }, + { + "epoch": 1.432896578043506, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0641329288482666, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8695803880691528, + "num_tokens": 429851748.0, + "step": 11264 + }, + { + "epoch": 1.4330237883220964, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9448504447937012, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8540869355201721, + "num_tokens": 429891033.0, + "step": 11265 + }, + { + "epoch": 1.433150998600687, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.865682601928711, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.867026686668396, + "num_tokens": 429935980.0, + "step": 11266 + }, + { + "epoch": 1.4332782088792775, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9459776878356934, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.852863609790802, + "num_tokens": 429970874.0, + "step": 11267 + }, + { + "epoch": 1.433405419157868, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8030850887298584, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8494069576263428, + "num_tokens": 430017909.0, + "step": 11268 + }, + { + "epoch": 1.4335326294364585, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.475039482116699, + "learning_rate": 1e-06, + "loss": 0.5445, + "mean_token_accuracy": 0.8383544683456421, + "num_tokens": 430050641.0, + "step": 11269 + }, + { + "epoch": 1.433659839715049, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9555612802505493, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8681812882423401, + "num_tokens": 430082985.0, + "step": 11270 + }, + { + "epoch": 1.4337870499936396, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8558614253997803, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8645511865615845, + "num_tokens": 430122340.0, + "step": 11271 + }, + { + "epoch": 1.4339142602722301, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.890918493270874, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8512113094329834, + "num_tokens": 430162140.0, + "step": 11272 + }, + { + "epoch": 1.4340414705508204, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8752855062484741, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8754618167877197, + "num_tokens": 430201672.0, + "step": 11273 + }, + { + "epoch": 1.434168680829411, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0620062351226807, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8528664112091064, + "num_tokens": 430239760.0, + "step": 11274 + }, + { + "epoch": 1.4342958911080015, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8037340641021729, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8458915948867798, + "num_tokens": 430280527.0, + "step": 11275 + }, + { + "epoch": 1.434423101386592, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9445099830627441, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8618401288986206, + "num_tokens": 430312788.0, + "step": 11276 + }, + { + "epoch": 1.4345503116651825, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9934691190719604, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.871562123298645, + "num_tokens": 430348044.0, + "step": 11277 + }, + { + "epoch": 1.434677521943773, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9549000263214111, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8803622722625732, + "num_tokens": 430385676.0, + "step": 11278 + }, + { + "epoch": 1.4348047322223636, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1727328300476074, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.860002875328064, + "num_tokens": 430416976.0, + "step": 11279 + }, + { + "epoch": 1.4349319425009541, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8607650995254517, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8748252391815186, + "num_tokens": 430460002.0, + "step": 11280 + }, + { + "epoch": 1.4350591527795447, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.76448655128479, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8427553176879883, + "num_tokens": 430503172.0, + "step": 11281 + }, + { + "epoch": 1.435186363058135, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8772172927856445, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8522202372550964, + "num_tokens": 430541221.0, + "step": 11282 + }, + { + "epoch": 1.4353135733367255, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.855099081993103, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8565898537635803, + "num_tokens": 430579527.0, + "step": 11283 + }, + { + "epoch": 1.435440783615316, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9499863386154175, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8662034869194031, + "num_tokens": 430617263.0, + "step": 11284 + }, + { + "epoch": 1.4355679938939065, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9861221313476562, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8768841028213501, + "num_tokens": 430649961.0, + "step": 11285 + }, + { + "epoch": 1.435695204172497, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1155660152435303, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8574655055999756, + "num_tokens": 430682599.0, + "step": 11286 + }, + { + "epoch": 1.4358224144510876, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9785789251327515, + "learning_rate": 1e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8364115953445435, + "num_tokens": 430721014.0, + "step": 11287 + }, + { + "epoch": 1.4359496247296781, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1013808250427246, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8624240159988403, + "num_tokens": 430757014.0, + "step": 11288 + }, + { + "epoch": 1.4360768350082687, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8594486713409424, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8647630214691162, + "num_tokens": 430796169.0, + "step": 11289 + }, + { + "epoch": 1.4362040452868592, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9618631601333618, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8623605966567993, + "num_tokens": 430832726.0, + "step": 11290 + }, + { + "epoch": 1.4363312555654497, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 16.621522903442383, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.853683352470398, + "num_tokens": 430866343.0, + "step": 11291 + }, + { + "epoch": 1.4364584658440402, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0842528343200684, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8689531087875366, + "num_tokens": 430901622.0, + "step": 11292 + }, + { + "epoch": 1.4365856761226308, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.0262060165405273, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8758437633514404, + "num_tokens": 430937499.0, + "step": 11293 + }, + { + "epoch": 1.4367128864012213, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0847465991973877, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8511295318603516, + "num_tokens": 430974048.0, + "step": 11294 + }, + { + "epoch": 1.4368400966798118, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8547037839889526, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8718856573104858, + "num_tokens": 431010927.0, + "step": 11295 + }, + { + "epoch": 1.4369673069584024, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.5489532947540283, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8490138053894043, + "num_tokens": 431051731.0, + "step": 11296 + }, + { + "epoch": 1.4370945172369929, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.7744879722595215, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8795772790908813, + "num_tokens": 431087876.0, + "step": 11297 + }, + { + "epoch": 1.4372217275155832, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9911590814590454, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8569743037223816, + "num_tokens": 431126217.0, + "step": 11298 + }, + { + "epoch": 1.4373489377941737, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8500421047210693, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8642838001251221, + "num_tokens": 431168049.0, + "step": 11299 + }, + { + "epoch": 1.4374761480727642, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7237025499343872, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8612821102142334, + "num_tokens": 431211143.0, + "step": 11300 + }, + { + "epoch": 1.4376033583513548, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8234727382659912, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8703774809837341, + "num_tokens": 431255747.0, + "step": 11301 + }, + { + "epoch": 1.4377305686299453, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8390451669692993, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8668337464332581, + "num_tokens": 431296618.0, + "step": 11302 + }, + { + "epoch": 1.4378577789085358, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9270116090774536, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8642497658729553, + "num_tokens": 431336493.0, + "step": 11303 + }, + { + "epoch": 1.4379849891871264, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8873744010925293, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8608705997467041, + "num_tokens": 431376200.0, + "step": 11304 + }, + { + "epoch": 1.438112199465717, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9436455965042114, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8434154391288757, + "num_tokens": 431414015.0, + "step": 11305 + }, + { + "epoch": 1.4382394097443074, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.905076503753662, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8581849336624146, + "num_tokens": 431453498.0, + "step": 11306 + }, + { + "epoch": 1.4383666200228977, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.7459452152252197, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8634450435638428, + "num_tokens": 431491126.0, + "step": 11307 + }, + { + "epoch": 1.4384938303014883, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.024057388305664, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8862265348434448, + "num_tokens": 431528549.0, + "step": 11308 + }, + { + "epoch": 1.4386210405800788, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.017141580581665, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8703644871711731, + "num_tokens": 431569435.0, + "step": 11309 + }, + { + "epoch": 1.4387482508586693, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.855621337890625, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8699092864990234, + "num_tokens": 431606125.0, + "step": 11310 + }, + { + "epoch": 1.4388754611372598, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8776479959487915, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8589794635772705, + "num_tokens": 431643517.0, + "step": 11311 + }, + { + "epoch": 1.4390026714158504, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8256187438964844, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8722778558731079, + "num_tokens": 431682058.0, + "step": 11312 + }, + { + "epoch": 1.439129881694441, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7910484075546265, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8749985098838806, + "num_tokens": 431721241.0, + "step": 11313 + }, + { + "epoch": 1.4392570919730314, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 16.596248626708984, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8761698007583618, + "num_tokens": 431757809.0, + "step": 11314 + }, + { + "epoch": 1.439384302251622, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9213993549346924, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8521555662155151, + "num_tokens": 431795931.0, + "step": 11315 + }, + { + "epoch": 1.4395115125302125, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9229531288146973, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8729596138000488, + "num_tokens": 431830517.0, + "step": 11316 + }, + { + "epoch": 1.439638722808803, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1502883434295654, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.866687536239624, + "num_tokens": 431866047.0, + "step": 11317 + }, + { + "epoch": 1.4397659330873935, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9433571100234985, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8524752855300903, + "num_tokens": 431901602.0, + "step": 11318 + }, + { + "epoch": 1.439893143365984, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9210855960845947, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8603829145431519, + "num_tokens": 431934989.0, + "step": 11319 + }, + { + "epoch": 1.4400203536445746, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.044722557067871, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8681371808052063, + "num_tokens": 431968664.0, + "step": 11320 + }, + { + "epoch": 1.4401475639231651, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.93146550655365, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8747913837432861, + "num_tokens": 432008025.0, + "step": 11321 + }, + { + "epoch": 1.4402747742017554, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0502634048461914, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8664647340774536, + "num_tokens": 432043526.0, + "step": 11322 + }, + { + "epoch": 1.440401984480346, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8132829666137695, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8640857934951782, + "num_tokens": 432085175.0, + "step": 11323 + }, + { + "epoch": 1.4405291947589365, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.989901065826416, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8544133901596069, + "num_tokens": 432122379.0, + "step": 11324 + }, + { + "epoch": 1.440656405037527, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.090808391571045, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8663281798362732, + "num_tokens": 432157690.0, + "step": 11325 + }, + { + "epoch": 1.4407836153161175, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1644320487976074, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8726083040237427, + "num_tokens": 432188885.0, + "step": 11326 + }, + { + "epoch": 1.440910825594708, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9785284996032715, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8642818927764893, + "num_tokens": 432228804.0, + "step": 11327 + }, + { + "epoch": 1.4410380358732986, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9790247678756714, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.846341073513031, + "num_tokens": 432264150.0, + "step": 11328 + }, + { + "epoch": 1.4411652461518891, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9287716150283813, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8575900197029114, + "num_tokens": 432306759.0, + "step": 11329 + }, + { + "epoch": 1.4412924564304797, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.182805061340332, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.872734010219574, + "num_tokens": 432346928.0, + "step": 11330 + }, + { + "epoch": 1.44141966670907, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8757474422454834, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8714768886566162, + "num_tokens": 432386630.0, + "step": 11331 + }, + { + "epoch": 1.4415468769876605, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.798819899559021, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8515787124633789, + "num_tokens": 432427331.0, + "step": 11332 + }, + { + "epoch": 1.441674087266251, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9625763893127441, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8557180166244507, + "num_tokens": 432462901.0, + "step": 11333 + }, + { + "epoch": 1.4418012975448415, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9375736713409424, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8613779544830322, + "num_tokens": 432494232.0, + "step": 11334 + }, + { + "epoch": 1.441928507823432, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0913660526275635, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8764934539794922, + "num_tokens": 432534276.0, + "step": 11335 + }, + { + "epoch": 1.4420557181020226, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8528416156768799, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8551937341690063, + "num_tokens": 432577337.0, + "step": 11336 + }, + { + "epoch": 1.4421829283806131, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8337758779525757, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8558971881866455, + "num_tokens": 432614290.0, + "step": 11337 + }, + { + "epoch": 1.4423101386592037, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9108983278274536, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8644837141036987, + "num_tokens": 432648558.0, + "step": 11338 + }, + { + "epoch": 1.4424373489377942, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.024918556213379, + "learning_rate": 1e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8370005488395691, + "num_tokens": 432684243.0, + "step": 11339 + }, + { + "epoch": 1.4425645592163847, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0952889919281006, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8629162907600403, + "num_tokens": 432719355.0, + "step": 11340 + }, + { + "epoch": 1.4426917694949752, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9277305603027344, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8653458952903748, + "num_tokens": 432758675.0, + "step": 11341 + }, + { + "epoch": 1.4428189797735658, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7987233400344849, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8598767518997192, + "num_tokens": 432801677.0, + "step": 11342 + }, + { + "epoch": 1.4429461900521563, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0547661781311035, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8628234267234802, + "num_tokens": 432836299.0, + "step": 11343 + }, + { + "epoch": 1.4430734003307468, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0350301265716553, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8616505265235901, + "num_tokens": 432868838.0, + "step": 11344 + }, + { + "epoch": 1.4432006106093374, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8421962261199951, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8648406267166138, + "num_tokens": 432904895.0, + "step": 11345 + }, + { + "epoch": 1.4433278208879279, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.85833740234375, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8583803176879883, + "num_tokens": 432944785.0, + "step": 11346 + }, + { + "epoch": 1.4434550311665182, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1434364318847656, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.852523922920227, + "num_tokens": 432975645.0, + "step": 11347 + }, + { + "epoch": 1.4435822414451087, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8653210401535034, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8577264547348022, + "num_tokens": 433016887.0, + "step": 11348 + }, + { + "epoch": 1.4437094517236992, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9960002899169922, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8647699356079102, + "num_tokens": 433053666.0, + "step": 11349 + }, + { + "epoch": 1.4438366620022898, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.757452368736267, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8585957288742065, + "num_tokens": 433101131.0, + "step": 11350 + }, + { + "epoch": 1.4439638722808803, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.963295340538025, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8705028295516968, + "num_tokens": 433132535.0, + "step": 11351 + }, + { + "epoch": 1.4440910825594708, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9657964706420898, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8493102192878723, + "num_tokens": 433168229.0, + "step": 11352 + }, + { + "epoch": 1.4442182928380614, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.847314476966858, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8671140670776367, + "num_tokens": 433206094.0, + "step": 11353 + }, + { + "epoch": 1.4443455031166519, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9345699548721313, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.852088987827301, + "num_tokens": 433241038.0, + "step": 11354 + }, + { + "epoch": 1.4444727133952424, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9823416471481323, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8556313514709473, + "num_tokens": 433274674.0, + "step": 11355 + }, + { + "epoch": 1.4445999236738327, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9377316236495972, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8665454387664795, + "num_tokens": 433314511.0, + "step": 11356 + }, + { + "epoch": 1.4447271339524232, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9325498342514038, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8649593591690063, + "num_tokens": 433349338.0, + "step": 11357 + }, + { + "epoch": 1.4448543442310138, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8844115734100342, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8727912902832031, + "num_tokens": 433382609.0, + "step": 11358 + }, + { + "epoch": 1.4449815545096043, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0263724327087402, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8784005045890808, + "num_tokens": 433411553.0, + "step": 11359 + }, + { + "epoch": 1.4451087647881948, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.016080617904663, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.85938560962677, + "num_tokens": 433450571.0, + "step": 11360 + }, + { + "epoch": 1.4452359750667854, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.065316677093506, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8581551909446716, + "num_tokens": 433486994.0, + "step": 11361 + }, + { + "epoch": 1.445363185345376, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8730344772338867, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8750708103179932, + "num_tokens": 433521407.0, + "step": 11362 + }, + { + "epoch": 1.4454903956239664, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9200727939605713, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8758343458175659, + "num_tokens": 433561338.0, + "step": 11363 + }, + { + "epoch": 1.445617605902557, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.037952184677124, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8713080883026123, + "num_tokens": 433598892.0, + "step": 11364 + }, + { + "epoch": 1.4457448161811475, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8772532939910889, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8537478446960449, + "num_tokens": 433641423.0, + "step": 11365 + }, + { + "epoch": 1.445872026459738, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8210047483444214, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8662559986114502, + "num_tokens": 433677385.0, + "step": 11366 + }, + { + "epoch": 1.4459992367383285, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9149830341339111, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8611897230148315, + "num_tokens": 433716901.0, + "step": 11367 + }, + { + "epoch": 1.446126447016919, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9416842460632324, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8600414991378784, + "num_tokens": 433755099.0, + "step": 11368 + }, + { + "epoch": 1.4462536572955096, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.904770851135254, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8591718673706055, + "num_tokens": 433794012.0, + "step": 11369 + }, + { + "epoch": 1.4463808675741001, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0649752616882324, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8675675988197327, + "num_tokens": 433831965.0, + "step": 11370 + }, + { + "epoch": 1.4465080778526904, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9943782091140747, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8637820482254028, + "num_tokens": 433870819.0, + "step": 11371 + }, + { + "epoch": 1.446635288131281, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9586540460586548, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8579152822494507, + "num_tokens": 433909197.0, + "step": 11372 + }, + { + "epoch": 1.4467624984098715, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8666523694992065, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8699559569358826, + "num_tokens": 433946026.0, + "step": 11373 + }, + { + "epoch": 1.446889708688462, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.695089340209961, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8854706287384033, + "num_tokens": 433986239.0, + "step": 11374 + }, + { + "epoch": 1.4470169189670525, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9734429121017456, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8685582280158997, + "num_tokens": 434022557.0, + "step": 11375 + }, + { + "epoch": 1.447144129245643, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.053774356842041, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8686505556106567, + "num_tokens": 434059258.0, + "step": 11376 + }, + { + "epoch": 1.4472713395242336, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.409013032913208, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8496489524841309, + "num_tokens": 434094872.0, + "step": 11377 + }, + { + "epoch": 1.4473985498028241, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0599873065948486, + "learning_rate": 1e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8394449353218079, + "num_tokens": 434134842.0, + "step": 11378 + }, + { + "epoch": 1.4475257600814146, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.836771845817566, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8750293254852295, + "num_tokens": 434173152.0, + "step": 11379 + }, + { + "epoch": 1.447652970360005, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9575992822647095, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8662878274917603, + "num_tokens": 434208318.0, + "step": 11380 + }, + { + "epoch": 1.4477801806385955, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8787909746170044, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8604713678359985, + "num_tokens": 434243693.0, + "step": 11381 + }, + { + "epoch": 1.447907390917186, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9967235326766968, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8528426885604858, + "num_tokens": 434283172.0, + "step": 11382 + }, + { + "epoch": 1.4480346011957765, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9054841995239258, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8638172149658203, + "num_tokens": 434318789.0, + "step": 11383 + }, + { + "epoch": 1.448161811474367, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9402121305465698, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8553887605667114, + "num_tokens": 434358943.0, + "step": 11384 + }, + { + "epoch": 1.4482890217529576, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8355077505111694, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8651002645492554, + "num_tokens": 434398264.0, + "step": 11385 + }, + { + "epoch": 1.4484162320315481, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.395805597305298, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8742702007293701, + "num_tokens": 434433602.0, + "step": 11386 + }, + { + "epoch": 1.4485434423101387, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7857223749160767, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8566795587539673, + "num_tokens": 434480001.0, + "step": 11387 + }, + { + "epoch": 1.4486706525887292, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8015490770339966, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8791635632514954, + "num_tokens": 434519285.0, + "step": 11388 + }, + { + "epoch": 1.4487978628673197, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9772850275039673, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8629454970359802, + "num_tokens": 434556118.0, + "step": 11389 + }, + { + "epoch": 1.4489250731459102, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8371055126190186, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8474650382995605, + "num_tokens": 434595350.0, + "step": 11390 + }, + { + "epoch": 1.4490522834245008, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.840943694114685, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8603802919387817, + "num_tokens": 434632988.0, + "step": 11391 + }, + { + "epoch": 1.4491794937030913, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.787675142288208, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8712237477302551, + "num_tokens": 434673040.0, + "step": 11392 + }, + { + "epoch": 1.4493067039816818, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8988193273544312, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8797657489776611, + "num_tokens": 434712201.0, + "step": 11393 + }, + { + "epoch": 1.4494339142602723, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.7843682765960693, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8773231506347656, + "num_tokens": 434756676.0, + "step": 11394 + }, + { + "epoch": 1.4495611245388629, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7071294784545898, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8669308423995972, + "num_tokens": 434797890.0, + "step": 11395 + }, + { + "epoch": 1.4496883348174532, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.7981034517288208, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8758724927902222, + "num_tokens": 434836933.0, + "step": 11396 + }, + { + "epoch": 1.4498155450960437, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.006092071533203, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8763456344604492, + "num_tokens": 434868098.0, + "step": 11397 + }, + { + "epoch": 1.4499427553746342, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.832303762435913, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8590022325515747, + "num_tokens": 434907239.0, + "step": 11398 + }, + { + "epoch": 1.4500699656532248, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9969218969345093, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8700196743011475, + "num_tokens": 434945672.0, + "step": 11399 + }, + { + "epoch": 1.4501971759318153, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.941723346710205, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8536002039909363, + "num_tokens": 434981693.0, + "step": 11400 + }, + { + "epoch": 1.4503243862104058, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.866487979888916, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.885918915271759, + "num_tokens": 435012452.0, + "step": 11401 + }, + { + "epoch": 1.4504515964889964, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9307892322540283, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8580662608146667, + "num_tokens": 435052711.0, + "step": 11402 + }, + { + "epoch": 1.4505788067675869, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.805092453956604, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8623642921447754, + "num_tokens": 435092766.0, + "step": 11403 + }, + { + "epoch": 1.4507060170461774, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0355918407440186, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.848797082901001, + "num_tokens": 435132093.0, + "step": 11404 + }, + { + "epoch": 1.4508332273247677, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0491085052490234, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8587119579315186, + "num_tokens": 435169071.0, + "step": 11405 + }, + { + "epoch": 1.4509604376033582, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.235578775405884, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8724607229232788, + "num_tokens": 435202665.0, + "step": 11406 + }, + { + "epoch": 1.4510876478819488, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.866847276687622, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8783615231513977, + "num_tokens": 435234719.0, + "step": 11407 + }, + { + "epoch": 1.4512148581605393, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0962233543395996, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8787323832511902, + "num_tokens": 435267090.0, + "step": 11408 + }, + { + "epoch": 1.4513420684391298, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7594020366668701, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8720349073410034, + "num_tokens": 435307243.0, + "step": 11409 + }, + { + "epoch": 1.4514692787177204, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8932554721832275, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8718751668930054, + "num_tokens": 435347507.0, + "step": 11410 + }, + { + "epoch": 1.4515964889963109, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8694573640823364, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8548014163970947, + "num_tokens": 435383499.0, + "step": 11411 + }, + { + "epoch": 1.4517236992749014, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8276004791259766, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8644399642944336, + "num_tokens": 435423500.0, + "step": 11412 + }, + { + "epoch": 1.451850909553492, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.00052547454834, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8671993017196655, + "num_tokens": 435457075.0, + "step": 11413 + }, + { + "epoch": 1.4519781198320825, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.757940649986267, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.863228976726532, + "num_tokens": 435498933.0, + "step": 11414 + }, + { + "epoch": 1.452105330110673, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9040924310684204, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8559523224830627, + "num_tokens": 435535949.0, + "step": 11415 + }, + { + "epoch": 1.4522325403892635, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.423778533935547, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8565854430198669, + "num_tokens": 435565252.0, + "step": 11416 + }, + { + "epoch": 1.452359750667854, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9519741535186768, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8451298475265503, + "num_tokens": 435604663.0, + "step": 11417 + }, + { + "epoch": 1.4524869609464446, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0173182487487793, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8595858812332153, + "num_tokens": 435640883.0, + "step": 11418 + }, + { + "epoch": 1.4526141712250351, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8711860179901123, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8676767945289612, + "num_tokens": 435682787.0, + "step": 11419 + }, + { + "epoch": 1.4527413815036254, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9121716022491455, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8626426458358765, + "num_tokens": 435719982.0, + "step": 11420 + }, + { + "epoch": 1.452868591782216, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0122311115264893, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8463026285171509, + "num_tokens": 435755153.0, + "step": 11421 + }, + { + "epoch": 1.4529958020608065, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.742675542831421, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8728817701339722, + "num_tokens": 435797513.0, + "step": 11422 + }, + { + "epoch": 1.453123012339397, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8494246006011963, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8631417155265808, + "num_tokens": 435836835.0, + "step": 11423 + }, + { + "epoch": 1.4532502226179875, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1367435455322266, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8548487424850464, + "num_tokens": 435875124.0, + "step": 11424 + }, + { + "epoch": 1.453377432896578, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.76224684715271, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8762391805648804, + "num_tokens": 435914993.0, + "step": 11425 + }, + { + "epoch": 1.4535046431751686, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0690693855285645, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8722407817840576, + "num_tokens": 435950916.0, + "step": 11426 + }, + { + "epoch": 1.4536318534537591, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8155884742736816, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8575254082679749, + "num_tokens": 435991986.0, + "step": 11427 + }, + { + "epoch": 1.4537590637323496, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.923053503036499, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.871428370475769, + "num_tokens": 436024603.0, + "step": 11428 + }, + { + "epoch": 1.45388627401094, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7525330781936646, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.860144853591919, + "num_tokens": 436065540.0, + "step": 11429 + }, + { + "epoch": 1.4540134842895305, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.064565420150757, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8508970141410828, + "num_tokens": 436104394.0, + "step": 11430 + }, + { + "epoch": 1.454140694568121, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0118768215179443, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.850131630897522, + "num_tokens": 436139977.0, + "step": 11431 + }, + { + "epoch": 1.4542679048467115, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.004843235015869, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.87935471534729, + "num_tokens": 436178047.0, + "step": 11432 + }, + { + "epoch": 1.454395115125302, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7870805263519287, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8626457452774048, + "num_tokens": 436218331.0, + "step": 11433 + }, + { + "epoch": 1.4545223254038926, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9166216850280762, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8626121282577515, + "num_tokens": 436255663.0, + "step": 11434 + }, + { + "epoch": 1.4546495356824831, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6680395603179932, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8642802238464355, + "num_tokens": 436301328.0, + "step": 11435 + }, + { + "epoch": 1.4547767459610736, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9762414693832397, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8719502687454224, + "num_tokens": 436336487.0, + "step": 11436 + }, + { + "epoch": 1.4549039562396642, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8403842449188232, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8657760620117188, + "num_tokens": 436377629.0, + "step": 11437 + }, + { + "epoch": 1.4550311665182547, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7377924919128418, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8788430690765381, + "num_tokens": 436421590.0, + "step": 11438 + }, + { + "epoch": 1.4551583767968452, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7941172122955322, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8558017015457153, + "num_tokens": 436461172.0, + "step": 11439 + }, + { + "epoch": 1.4552855870754358, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7576051950454712, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8728013038635254, + "num_tokens": 436497536.0, + "step": 11440 + }, + { + "epoch": 1.4554127973540263, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.757716178894043, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8615569472312927, + "num_tokens": 436543906.0, + "step": 11441 + }, + { + "epoch": 1.4555400076326168, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.099745273590088, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8667570352554321, + "num_tokens": 436579462.0, + "step": 11442 + }, + { + "epoch": 1.4556672179112073, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8630019426345825, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8564022779464722, + "num_tokens": 436618840.0, + "step": 11443 + }, + { + "epoch": 1.4557944281897979, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.916579008102417, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8520218133926392, + "num_tokens": 436654176.0, + "step": 11444 + }, + { + "epoch": 1.4559216384683882, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9316372871398926, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8779825568199158, + "num_tokens": 436685507.0, + "step": 11445 + }, + { + "epoch": 1.4560488487469787, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7940746545791626, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8712680339813232, + "num_tokens": 436729881.0, + "step": 11446 + }, + { + "epoch": 1.4561760590255692, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9208157062530518, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8626422882080078, + "num_tokens": 436772355.0, + "step": 11447 + }, + { + "epoch": 1.4563032693041598, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8240982294082642, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8577023148536682, + "num_tokens": 436808701.0, + "step": 11448 + }, + { + "epoch": 1.4564304795827503, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8910982608795166, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8728200197219849, + "num_tokens": 436845441.0, + "step": 11449 + }, + { + "epoch": 1.4565576898613408, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.836692214012146, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8668590784072876, + "num_tokens": 436885316.0, + "step": 11450 + }, + { + "epoch": 1.4566849001399313, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.976006269454956, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8580042719841003, + "num_tokens": 436918315.0, + "step": 11451 + }, + { + "epoch": 1.4568121104185219, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9209760427474976, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8568406105041504, + "num_tokens": 436956039.0, + "step": 11452 + }, + { + "epoch": 1.4569393206971124, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9758964776992798, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8498871922492981, + "num_tokens": 436991274.0, + "step": 11453 + }, + { + "epoch": 1.4570665309757027, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8185218572616577, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8754851818084717, + "num_tokens": 437029963.0, + "step": 11454 + }, + { + "epoch": 1.4571937412542932, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0190112590789795, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8570528626441956, + "num_tokens": 437067030.0, + "step": 11455 + }, + { + "epoch": 1.4573209515328838, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8413993120193481, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8704379200935364, + "num_tokens": 437103363.0, + "step": 11456 + }, + { + "epoch": 1.4574481618114743, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.039921283721924, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8710712194442749, + "num_tokens": 437131574.0, + "step": 11457 + }, + { + "epoch": 1.4575753720900648, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8053500652313232, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8579311370849609, + "num_tokens": 437169537.0, + "step": 11458 + }, + { + "epoch": 1.4577025823686554, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8586739301681519, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8680155277252197, + "num_tokens": 437208025.0, + "step": 11459 + }, + { + "epoch": 1.4578297926472459, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9230685234069824, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8746196627616882, + "num_tokens": 437246642.0, + "step": 11460 + }, + { + "epoch": 1.4579570029258364, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.797254204750061, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8602696657180786, + "num_tokens": 437285274.0, + "step": 11461 + }, + { + "epoch": 1.458084213204427, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8216098546981812, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8807240128517151, + "num_tokens": 437325865.0, + "step": 11462 + }, + { + "epoch": 1.4582114234830175, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8324635028839111, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8693233132362366, + "num_tokens": 437364093.0, + "step": 11463 + }, + { + "epoch": 1.458338633761608, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8924903869628906, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.851476788520813, + "num_tokens": 437402413.0, + "step": 11464 + }, + { + "epoch": 1.4584658440401985, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9682575464248657, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8507155179977417, + "num_tokens": 437442583.0, + "step": 11465 + }, + { + "epoch": 1.458593054318789, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.005734443664551, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8639193773269653, + "num_tokens": 437485682.0, + "step": 11466 + }, + { + "epoch": 1.4587202645973796, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8320201635360718, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8545792102813721, + "num_tokens": 437524859.0, + "step": 11467 + }, + { + "epoch": 1.45884747487597, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0003366470336914, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8558462262153625, + "num_tokens": 437563590.0, + "step": 11468 + }, + { + "epoch": 1.4589746851545604, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.84185791015625, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8802124261856079, + "num_tokens": 437601429.0, + "step": 11469 + }, + { + "epoch": 1.459101895433151, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.957170009613037, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8509069681167603, + "num_tokens": 437639144.0, + "step": 11470 + }, + { + "epoch": 1.4592291057117415, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7630337476730347, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8723616600036621, + "num_tokens": 437680846.0, + "step": 11471 + }, + { + "epoch": 1.459356315990332, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8678313493728638, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8659752607345581, + "num_tokens": 437722156.0, + "step": 11472 + }, + { + "epoch": 1.4594835262689225, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7956898212432861, + "learning_rate": 1e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.8411730527877808, + "num_tokens": 437766905.0, + "step": 11473 + }, + { + "epoch": 1.459610736547513, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9437475204467773, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8639163970947266, + "num_tokens": 437801425.0, + "step": 11474 + }, + { + "epoch": 1.4597379468261036, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.114612102508545, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.854154109954834, + "num_tokens": 437838126.0, + "step": 11475 + }, + { + "epoch": 1.4598651571046941, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9722081422805786, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8588784337043762, + "num_tokens": 437876597.0, + "step": 11476 + }, + { + "epoch": 1.4599923673832846, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8234336376190186, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8757497072219849, + "num_tokens": 437911852.0, + "step": 11477 + }, + { + "epoch": 1.460119577661875, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7272065877914429, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8641037344932556, + "num_tokens": 437953418.0, + "step": 11478 + }, + { + "epoch": 1.4602467879404655, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8808361291885376, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8704477548599243, + "num_tokens": 437989561.0, + "step": 11479 + }, + { + "epoch": 1.460373998219056, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8373169898986816, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.872832179069519, + "num_tokens": 438033464.0, + "step": 11480 + }, + { + "epoch": 1.4605012084976465, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8105610609054565, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8674483299255371, + "num_tokens": 438068676.0, + "step": 11481 + }, + { + "epoch": 1.460628418776237, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0238728523254395, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8466624021530151, + "num_tokens": 438107867.0, + "step": 11482 + }, + { + "epoch": 1.4607556290548276, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8554701805114746, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8797045946121216, + "num_tokens": 438144256.0, + "step": 11483 + }, + { + "epoch": 1.4608828393334181, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.735019326210022, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8731458783149719, + "num_tokens": 438183458.0, + "step": 11484 + }, + { + "epoch": 1.4610100496120086, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0734803676605225, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8515999913215637, + "num_tokens": 438216025.0, + "step": 11485 + }, + { + "epoch": 1.4611372598905992, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8283216953277588, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8747067451477051, + "num_tokens": 438258185.0, + "step": 11486 + }, + { + "epoch": 1.4612644701691897, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.881312608718872, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8594873547554016, + "num_tokens": 438298189.0, + "step": 11487 + }, + { + "epoch": 1.4613916804477802, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.037943124771118, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8528107404708862, + "num_tokens": 438336593.0, + "step": 11488 + }, + { + "epoch": 1.4615188907263708, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8224786520004272, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8701446056365967, + "num_tokens": 438374110.0, + "step": 11489 + }, + { + "epoch": 1.4616461010049613, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8941911458969116, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8467320203781128, + "num_tokens": 438413062.0, + "step": 11490 + }, + { + "epoch": 1.4617733112835518, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9157161712646484, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8666868805885315, + "num_tokens": 438452055.0, + "step": 11491 + }, + { + "epoch": 1.4619005215621423, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8187618255615234, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8659060001373291, + "num_tokens": 438494224.0, + "step": 11492 + }, + { + "epoch": 1.4620277318407329, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0886588096618652, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8601787090301514, + "num_tokens": 438529790.0, + "step": 11493 + }, + { + "epoch": 1.4621549421193232, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8220279216766357, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8674135208129883, + "num_tokens": 438571371.0, + "step": 11494 + }, + { + "epoch": 1.4622821523979137, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.885617971420288, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8503039479255676, + "num_tokens": 438612383.0, + "step": 11495 + }, + { + "epoch": 1.4624093626765042, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8725321292877197, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8536930084228516, + "num_tokens": 438653443.0, + "step": 11496 + }, + { + "epoch": 1.4625365729550948, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0204460620880127, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8515774011611938, + "num_tokens": 438689565.0, + "step": 11497 + }, + { + "epoch": 1.4626637832336853, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9248734712600708, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8639830946922302, + "num_tokens": 438727753.0, + "step": 11498 + }, + { + "epoch": 1.4627909935122758, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9679436683654785, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8626021146774292, + "num_tokens": 438764667.0, + "step": 11499 + }, + { + "epoch": 1.4629182037908663, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9016311168670654, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8703293800354004, + "num_tokens": 438801739.0, + "step": 11500 + }, + { + "epoch": 1.4630454140694569, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9085795879364014, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8649722337722778, + "num_tokens": 438840028.0, + "step": 11501 + }, + { + "epoch": 1.4631726243480474, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8041037321090698, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8622571229934692, + "num_tokens": 438880632.0, + "step": 11502 + }, + { + "epoch": 1.4632998346266377, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9703090190887451, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.860544741153717, + "num_tokens": 438923040.0, + "step": 11503 + }, + { + "epoch": 1.4634270449052282, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.945424199104309, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8637456893920898, + "num_tokens": 438958597.0, + "step": 11504 + }, + { + "epoch": 1.4635542551838188, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7578911781311035, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8521120548248291, + "num_tokens": 438997909.0, + "step": 11505 + }, + { + "epoch": 1.4636814654624093, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9454141855239868, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.872021496295929, + "num_tokens": 439030084.0, + "step": 11506 + }, + { + "epoch": 1.4638086757409998, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8294028043746948, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8665693402290344, + "num_tokens": 439067770.0, + "step": 11507 + }, + { + "epoch": 1.4639358860195903, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9229352474212646, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8657010793685913, + "num_tokens": 439102476.0, + "step": 11508 + }, + { + "epoch": 1.4640630962981809, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7543939352035522, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8734001517295837, + "num_tokens": 439138660.0, + "step": 11509 + }, + { + "epoch": 1.4641903065767714, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.72059166431427, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8713917136192322, + "num_tokens": 439181690.0, + "step": 11510 + }, + { + "epoch": 1.464317516855362, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8197945356369019, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.870040774345398, + "num_tokens": 439218825.0, + "step": 11511 + }, + { + "epoch": 1.4644447271339525, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9006779193878174, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8696302175521851, + "num_tokens": 439253505.0, + "step": 11512 + }, + { + "epoch": 1.464571937412543, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9643397331237793, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8583979606628418, + "num_tokens": 439292764.0, + "step": 11513 + }, + { + "epoch": 1.4646991476911335, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7346923351287842, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8734642863273621, + "num_tokens": 439335269.0, + "step": 11514 + }, + { + "epoch": 1.464826357969724, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8337255716323853, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8662897348403931, + "num_tokens": 439371977.0, + "step": 11515 + }, + { + "epoch": 1.4649535682483146, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9429993629455566, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.864014208316803, + "num_tokens": 439412710.0, + "step": 11516 + }, + { + "epoch": 1.465080778526905, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8841391801834106, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8627669811248779, + "num_tokens": 439449637.0, + "step": 11517 + }, + { + "epoch": 1.4652079888054954, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7192144393920898, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8588020205497742, + "num_tokens": 439495143.0, + "step": 11518 + }, + { + "epoch": 1.465335199084086, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9417144060134888, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8519036173820496, + "num_tokens": 439532721.0, + "step": 11519 + }, + { + "epoch": 1.4654624093626765, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8460323810577393, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.872949481010437, + "num_tokens": 439567938.0, + "step": 11520 + }, + { + "epoch": 1.465589619641267, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1990649700164795, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8730124235153198, + "num_tokens": 439595720.0, + "step": 11521 + }, + { + "epoch": 1.4657168299198575, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0237157344818115, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8644716739654541, + "num_tokens": 439634091.0, + "step": 11522 + }, + { + "epoch": 1.465844040198448, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1713707447052, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8529226779937744, + "num_tokens": 439671978.0, + "step": 11523 + }, + { + "epoch": 1.4659712504770386, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.000439405441284, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8588300943374634, + "num_tokens": 439707748.0, + "step": 11524 + }, + { + "epoch": 1.466098460755629, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8698102235794067, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8709516525268555, + "num_tokens": 439744060.0, + "step": 11525 + }, + { + "epoch": 1.4662256710342196, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7687433958053589, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8563530445098877, + "num_tokens": 439789277.0, + "step": 11526 + }, + { + "epoch": 1.46635288131281, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.4422800540924072, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.881019115447998, + "num_tokens": 439827524.0, + "step": 11527 + }, + { + "epoch": 1.4664800915914005, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.547844409942627, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8594345450401306, + "num_tokens": 439865181.0, + "step": 11528 + }, + { + "epoch": 1.466607301869991, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1794657707214355, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8556908369064331, + "num_tokens": 439900071.0, + "step": 11529 + }, + { + "epoch": 1.4667345121485815, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8600118160247803, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8621941804885864, + "num_tokens": 439940839.0, + "step": 11530 + }, + { + "epoch": 1.466861722427172, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8778358697891235, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8609974980354309, + "num_tokens": 439978908.0, + "step": 11531 + }, + { + "epoch": 1.4669889327057626, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9381771087646484, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8626194000244141, + "num_tokens": 440013448.0, + "step": 11532 + }, + { + "epoch": 1.467116142984353, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9793919324874878, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8672271966934204, + "num_tokens": 440048377.0, + "step": 11533 + }, + { + "epoch": 1.4672433532629436, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7722259759902954, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8579713106155396, + "num_tokens": 440089806.0, + "step": 11534 + }, + { + "epoch": 1.4673705635415342, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0594377517700195, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8689645528793335, + "num_tokens": 440124209.0, + "step": 11535 + }, + { + "epoch": 1.4674977738201247, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0183303356170654, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8653908371925354, + "num_tokens": 440158349.0, + "step": 11536 + }, + { + "epoch": 1.4676249840987152, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.267545461654663, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8656861782073975, + "num_tokens": 440195220.0, + "step": 11537 + }, + { + "epoch": 1.4677521943773058, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8424769639968872, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.863976240158081, + "num_tokens": 440235158.0, + "step": 11538 + }, + { + "epoch": 1.4678794046558963, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.011568307876587, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8580946922302246, + "num_tokens": 440271879.0, + "step": 11539 + }, + { + "epoch": 1.4680066149344868, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9864270687103271, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8618266582489014, + "num_tokens": 440306454.0, + "step": 11540 + }, + { + "epoch": 1.4681338252130773, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.016274929046631, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8521988391876221, + "num_tokens": 440341185.0, + "step": 11541 + }, + { + "epoch": 1.4682610354916679, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2322018146514893, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8619680404663086, + "num_tokens": 440379168.0, + "step": 11542 + }, + { + "epoch": 1.4683882457702582, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.78639817237854, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8662062287330627, + "num_tokens": 440417945.0, + "step": 11543 + }, + { + "epoch": 1.4685154560488487, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8495324850082397, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8485746383666992, + "num_tokens": 440457864.0, + "step": 11544 + }, + { + "epoch": 1.4686426663274392, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0255911350250244, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8582459688186646, + "num_tokens": 440491709.0, + "step": 11545 + }, + { + "epoch": 1.4687698766060298, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7917848825454712, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8696286082267761, + "num_tokens": 440533570.0, + "step": 11546 + }, + { + "epoch": 1.4688970868846203, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1206960678100586, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8687737584114075, + "num_tokens": 440568360.0, + "step": 11547 + }, + { + "epoch": 1.4690242971632108, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.839920163154602, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.873553454875946, + "num_tokens": 440606848.0, + "step": 11548 + }, + { + "epoch": 1.4691515074418013, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.065415620803833, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8762184977531433, + "num_tokens": 440638254.0, + "step": 11549 + }, + { + "epoch": 1.4692787177203919, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9104318618774414, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.871086835861206, + "num_tokens": 440674122.0, + "step": 11550 + }, + { + "epoch": 1.4694059279989824, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7453364133834839, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8781936168670654, + "num_tokens": 440718362.0, + "step": 11551 + }, + { + "epoch": 1.4695331382775727, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9881242513656616, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8709654211997986, + "num_tokens": 440755971.0, + "step": 11552 + }, + { + "epoch": 1.4696603485561632, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9028068780899048, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8613556623458862, + "num_tokens": 440794328.0, + "step": 11553 + }, + { + "epoch": 1.4697875588347538, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0317139625549316, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8514663577079773, + "num_tokens": 440833383.0, + "step": 11554 + }, + { + "epoch": 1.4699147691133443, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7883070707321167, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8708713054656982, + "num_tokens": 440875502.0, + "step": 11555 + }, + { + "epoch": 1.4700419793919348, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8239020109176636, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8672683238983154, + "num_tokens": 440915259.0, + "step": 11556 + }, + { + "epoch": 1.4701691896705253, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8440139293670654, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8616518378257751, + "num_tokens": 440951660.0, + "step": 11557 + }, + { + "epoch": 1.4702963999491159, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 3.51766300201416, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8640560507774353, + "num_tokens": 440992533.0, + "step": 11558 + }, + { + "epoch": 1.4704236102277064, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.981683373451233, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8527613878250122, + "num_tokens": 441027360.0, + "step": 11559 + }, + { + "epoch": 1.470550820506297, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7561899423599243, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8700708150863647, + "num_tokens": 441071746.0, + "step": 11560 + }, + { + "epoch": 1.4706780307848875, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 16.61666488647461, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8621665239334106, + "num_tokens": 441105378.0, + "step": 11561 + }, + { + "epoch": 1.470805241063478, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8878501653671265, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8651580810546875, + "num_tokens": 441150000.0, + "step": 11562 + }, + { + "epoch": 1.4709324513420685, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7708773612976074, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8571382761001587, + "num_tokens": 441188426.0, + "step": 11563 + }, + { + "epoch": 1.471059661620659, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9014122486114502, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8703819513320923, + "num_tokens": 441226028.0, + "step": 11564 + }, + { + "epoch": 1.4711868718992496, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9782227277755737, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8648753762245178, + "num_tokens": 441265788.0, + "step": 11565 + }, + { + "epoch": 1.47131408217784, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9140084981918335, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8728336095809937, + "num_tokens": 441297280.0, + "step": 11566 + }, + { + "epoch": 1.4714412924564304, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.919358253479004, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.864630937576294, + "num_tokens": 441331229.0, + "step": 11567 + }, + { + "epoch": 1.471568502735021, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.778571605682373, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8690311908721924, + "num_tokens": 441371064.0, + "step": 11568 + }, + { + "epoch": 1.4716957130136115, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9418996572494507, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8460456132888794, + "num_tokens": 441405706.0, + "step": 11569 + }, + { + "epoch": 1.471822923292202, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8260483741760254, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8549858331680298, + "num_tokens": 441445966.0, + "step": 11570 + }, + { + "epoch": 1.4719501335707925, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.299567461013794, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.848493218421936, + "num_tokens": 441478916.0, + "step": 11571 + }, + { + "epoch": 1.472077343849383, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.038468599319458, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.860406219959259, + "num_tokens": 441519245.0, + "step": 11572 + }, + { + "epoch": 1.4722045541279736, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.974483847618103, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8568307757377625, + "num_tokens": 441557485.0, + "step": 11573 + }, + { + "epoch": 1.472331764406564, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1042423248291016, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8747944831848145, + "num_tokens": 441596042.0, + "step": 11574 + }, + { + "epoch": 1.4724589746851546, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.044769287109375, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8740638494491577, + "num_tokens": 441634391.0, + "step": 11575 + }, + { + "epoch": 1.472586184963745, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9708455801010132, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8563787937164307, + "num_tokens": 441672201.0, + "step": 11576 + }, + { + "epoch": 1.4727133952423355, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8692594766616821, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8621350526809692, + "num_tokens": 441713350.0, + "step": 11577 + }, + { + "epoch": 1.472840605520926, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7382593154907227, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8624678254127502, + "num_tokens": 441754677.0, + "step": 11578 + }, + { + "epoch": 1.4729678157995165, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8108928203582764, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8531486988067627, + "num_tokens": 441797037.0, + "step": 11579 + }, + { + "epoch": 1.473095026078107, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2114241123199463, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8625988364219666, + "num_tokens": 441831661.0, + "step": 11580 + }, + { + "epoch": 1.4732222363566976, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.960623860359192, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.867385983467102, + "num_tokens": 441876933.0, + "step": 11581 + }, + { + "epoch": 1.473349446635288, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7311391830444336, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8848943710327148, + "num_tokens": 441918015.0, + "step": 11582 + }, + { + "epoch": 1.4734766569138786, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 5.115583896636963, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8571997284889221, + "num_tokens": 441956573.0, + "step": 11583 + }, + { + "epoch": 1.4736038671924692, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9684851169586182, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8769710659980774, + "num_tokens": 442002064.0, + "step": 11584 + }, + { + "epoch": 1.4737310774710597, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.918765902519226, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8703814744949341, + "num_tokens": 442040122.0, + "step": 11585 + }, + { + "epoch": 1.4738582877496502, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0059168338775635, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8670720458030701, + "num_tokens": 442079698.0, + "step": 11586 + }, + { + "epoch": 1.4739854980282407, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8039827346801758, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8770626783370972, + "num_tokens": 442118252.0, + "step": 11587 + }, + { + "epoch": 1.4741127083068313, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.250269651412964, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8687878251075745, + "num_tokens": 442154675.0, + "step": 11588 + }, + { + "epoch": 1.4742399185854218, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0446951389312744, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8708615303039551, + "num_tokens": 442189449.0, + "step": 11589 + }, + { + "epoch": 1.4743671288640123, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.963125467300415, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8676390647888184, + "num_tokens": 442222844.0, + "step": 11590 + }, + { + "epoch": 1.4744943391426026, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7752902507781982, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8713001012802124, + "num_tokens": 442258599.0, + "step": 11591 + }, + { + "epoch": 1.4746215494211932, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9299968481063843, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8716244697570801, + "num_tokens": 442294775.0, + "step": 11592 + }, + { + "epoch": 1.4747487596997837, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9916878938674927, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8742707967758179, + "num_tokens": 442330313.0, + "step": 11593 + }, + { + "epoch": 1.4748759699783742, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0986008644104004, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8634908199310303, + "num_tokens": 442364988.0, + "step": 11594 + }, + { + "epoch": 1.4750031802569648, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9518338441848755, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8652870655059814, + "num_tokens": 442406927.0, + "step": 11595 + }, + { + "epoch": 1.4751303905355553, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9899017810821533, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8681360483169556, + "num_tokens": 442442092.0, + "step": 11596 + }, + { + "epoch": 1.4752576008141458, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8580821752548218, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8723220825195312, + "num_tokens": 442483013.0, + "step": 11597 + }, + { + "epoch": 1.4753848110927363, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7573506832122803, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.869012176990509, + "num_tokens": 442525052.0, + "step": 11598 + }, + { + "epoch": 1.4755120213713269, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8970810174942017, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8555479049682617, + "num_tokens": 442562543.0, + "step": 11599 + }, + { + "epoch": 1.4756392316499174, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.9507883787155151, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8692674040794373, + "num_tokens": 442604976.0, + "step": 11600 + }, + { + "epoch": 1.4757664419285077, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8048118352890015, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8766757249832153, + "num_tokens": 442642271.0, + "step": 11601 + }, + { + "epoch": 1.4758936522070982, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 1.9923295974731445, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8610389232635498, + "num_tokens": 442680508.0, + "step": 11602 + }, + { + "epoch": 1.4760208624856888, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.5295934677124023, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8542657494544983, + "num_tokens": 442718391.0, + "step": 11603 + }, + { + "epoch": 1.4761480727642793, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9339313507080078, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8743144273757935, + "num_tokens": 442757688.0, + "step": 11604 + }, + { + "epoch": 1.4762752830428698, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9631575345993042, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8774299025535583, + "num_tokens": 442796704.0, + "step": 11605 + }, + { + "epoch": 1.4764024933214603, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8475253582000732, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8744865655899048, + "num_tokens": 442838791.0, + "step": 11606 + }, + { + "epoch": 1.4765297036000509, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7842578887939453, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8656923770904541, + "num_tokens": 442882368.0, + "step": 11607 + }, + { + "epoch": 1.4766569138786414, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8425956964492798, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8652375936508179, + "num_tokens": 442923768.0, + "step": 11608 + }, + { + "epoch": 1.476784124157232, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9720914363861084, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.879875659942627, + "num_tokens": 442955656.0, + "step": 11609 + }, + { + "epoch": 1.4769113344358225, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7855995893478394, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8566380143165588, + "num_tokens": 442997323.0, + "step": 11610 + }, + { + "epoch": 1.477038544714413, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.930294394493103, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8514125943183899, + "num_tokens": 443033454.0, + "step": 11611 + }, + { + "epoch": 1.4771657549930035, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 16.59002685546875, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8648754954338074, + "num_tokens": 443077902.0, + "step": 11612 + }, + { + "epoch": 1.477292965271594, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.132042169570923, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8560844659805298, + "num_tokens": 443116515.0, + "step": 11613 + }, + { + "epoch": 1.4774201755501846, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9568729400634766, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8897486329078674, + "num_tokens": 443154124.0, + "step": 11614 + }, + { + "epoch": 1.477547385828775, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.910711646080017, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8735175728797913, + "num_tokens": 443195852.0, + "step": 11615 + }, + { + "epoch": 1.4776745961073654, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.025787591934204, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8565528988838196, + "num_tokens": 443229850.0, + "step": 11616 + }, + { + "epoch": 1.477801806385956, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8098392486572266, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8579652905464172, + "num_tokens": 443274481.0, + "step": 11617 + }, + { + "epoch": 1.4779290166645465, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9028173685073853, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8709282279014587, + "num_tokens": 443311737.0, + "step": 11618 + }, + { + "epoch": 1.478056226943137, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.042630672454834, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8579815626144409, + "num_tokens": 443345310.0, + "step": 11619 + }, + { + "epoch": 1.4781834372217275, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7779638767242432, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.862953245639801, + "num_tokens": 443392869.0, + "step": 11620 + }, + { + "epoch": 1.478310647500318, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9610296487808228, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8621514439582825, + "num_tokens": 443432842.0, + "step": 11621 + }, + { + "epoch": 1.4784378577789086, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8970164060592651, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8659798502922058, + "num_tokens": 443465954.0, + "step": 11622 + }, + { + "epoch": 1.478565068057499, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7461400032043457, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8554480075836182, + "num_tokens": 443508506.0, + "step": 11623 + }, + { + "epoch": 1.4786922783360896, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0467731952667236, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8624995350837708, + "num_tokens": 443543575.0, + "step": 11624 + }, + { + "epoch": 1.47881948861468, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.0101284980773926, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8674798607826233, + "num_tokens": 443578504.0, + "step": 11625 + }, + { + "epoch": 1.4789466988932705, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9567278623580933, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.877531886100769, + "num_tokens": 443614647.0, + "step": 11626 + }, + { + "epoch": 1.479073909171861, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.004173994064331, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.859520673751831, + "num_tokens": 443649244.0, + "step": 11627 + }, + { + "epoch": 1.4792011194504515, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9052239656448364, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8613582849502563, + "num_tokens": 443688374.0, + "step": 11628 + }, + { + "epoch": 1.479328329729042, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9241442680358887, + "learning_rate": 1e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8343948721885681, + "num_tokens": 443730929.0, + "step": 11629 + }, + { + "epoch": 1.4794555400076326, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9977349042892456, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8573474884033203, + "num_tokens": 443763067.0, + "step": 11630 + }, + { + "epoch": 1.479582750286223, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9612351655960083, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8523203730583191, + "num_tokens": 443801273.0, + "step": 11631 + }, + { + "epoch": 1.4797099605648136, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.946283221244812, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8521672487258911, + "num_tokens": 443840394.0, + "step": 11632 + }, + { + "epoch": 1.4798371708434042, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0244908332824707, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8601617813110352, + "num_tokens": 443877486.0, + "step": 11633 + }, + { + "epoch": 1.4799643811219947, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8267579078674316, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8639170527458191, + "num_tokens": 443913763.0, + "step": 11634 + }, + { + "epoch": 1.4800915914005852, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0033791065216064, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8728005886077881, + "num_tokens": 443946840.0, + "step": 11635 + }, + { + "epoch": 1.4802188016791757, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7608658075332642, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8495500087738037, + "num_tokens": 443992439.0, + "step": 11636 + }, + { + "epoch": 1.4803460119577663, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9568856954574585, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8740253448486328, + "num_tokens": 444029348.0, + "step": 11637 + }, + { + "epoch": 1.4804732222363568, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8710427284240723, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8733269572257996, + "num_tokens": 444068386.0, + "step": 11638 + }, + { + "epoch": 1.4806004325149473, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.955132007598877, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8518180251121521, + "num_tokens": 444110307.0, + "step": 11639 + }, + { + "epoch": 1.4807276427935376, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7957037687301636, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8637813329696655, + "num_tokens": 444153919.0, + "step": 11640 + }, + { + "epoch": 1.4808548530721282, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8469640016555786, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8664464354515076, + "num_tokens": 444190575.0, + "step": 11641 + }, + { + "epoch": 1.4809820633507187, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1135942935943604, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8579409122467041, + "num_tokens": 444226108.0, + "step": 11642 + }, + { + "epoch": 1.4811092736293092, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2864608764648438, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.861400842666626, + "num_tokens": 444268162.0, + "step": 11643 + }, + { + "epoch": 1.4812364839078997, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8861188888549805, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8504257202148438, + "num_tokens": 444309273.0, + "step": 11644 + }, + { + "epoch": 1.4813636941864903, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0382049083709717, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8681454658508301, + "num_tokens": 444338697.0, + "step": 11645 + }, + { + "epoch": 1.4814909044650808, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9856280088424683, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8733789920806885, + "num_tokens": 444377954.0, + "step": 11646 + }, + { + "epoch": 1.4816181147436713, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8251756429672241, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.864181399345398, + "num_tokens": 444416996.0, + "step": 11647 + }, + { + "epoch": 1.4817453250222619, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.928220510482788, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8707629442214966, + "num_tokens": 444455550.0, + "step": 11648 + }, + { + "epoch": 1.4818725353008524, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.949469804763794, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.859374463558197, + "num_tokens": 444493979.0, + "step": 11649 + }, + { + "epoch": 1.4819997455794427, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8450326919555664, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8617037534713745, + "num_tokens": 444540622.0, + "step": 11650 + }, + { + "epoch": 1.4821269558580332, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2468149662017822, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.86119544506073, + "num_tokens": 444577772.0, + "step": 11651 + }, + { + "epoch": 1.4822541661366238, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1294305324554443, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8517262935638428, + "num_tokens": 444615523.0, + "step": 11652 + }, + { + "epoch": 1.4823813764152143, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9790045022964478, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8629980087280273, + "num_tokens": 444659785.0, + "step": 11653 + }, + { + "epoch": 1.4825085866938048, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.4524261951446533, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8598055839538574, + "num_tokens": 444700806.0, + "step": 11654 + }, + { + "epoch": 1.4826357969723953, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9646358489990234, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8649362325668335, + "num_tokens": 444734320.0, + "step": 11655 + }, + { + "epoch": 1.4827630072509859, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0025196075439453, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8576611280441284, + "num_tokens": 444770215.0, + "step": 11656 + }, + { + "epoch": 1.4828902175295764, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0257136821746826, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8624239563941956, + "num_tokens": 444805432.0, + "step": 11657 + }, + { + "epoch": 1.483017427808167, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1381499767303467, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.862432599067688, + "num_tokens": 444832398.0, + "step": 11658 + }, + { + "epoch": 1.4831446380867574, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7194864749908447, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.867972731590271, + "num_tokens": 444876128.0, + "step": 11659 + }, + { + "epoch": 1.483271848365348, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8726611137390137, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8582907915115356, + "num_tokens": 444915188.0, + "step": 11660 + }, + { + "epoch": 1.4833990586439385, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8506999015808105, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8680850267410278, + "num_tokens": 444953528.0, + "step": 11661 + }, + { + "epoch": 1.483526268922529, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7807186841964722, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8577355146408081, + "num_tokens": 444996469.0, + "step": 11662 + }, + { + "epoch": 1.4836534792011196, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6791059970855713, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8801369667053223, + "num_tokens": 445034298.0, + "step": 11663 + }, + { + "epoch": 1.48378068947971, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7760876417160034, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8628437519073486, + "num_tokens": 445079010.0, + "step": 11664 + }, + { + "epoch": 1.4839078997583004, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.802983283996582, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8766151666641235, + "num_tokens": 445116009.0, + "step": 11665 + }, + { + "epoch": 1.484035110036891, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7075128555297852, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.87814861536026, + "num_tokens": 445156313.0, + "step": 11666 + }, + { + "epoch": 1.4841623203154815, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.040483236312866, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8644904494285583, + "num_tokens": 445187424.0, + "step": 11667 + }, + { + "epoch": 1.484289530594072, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.746908187866211, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8643150329589844, + "num_tokens": 445233448.0, + "step": 11668 + }, + { + "epoch": 1.4844167408726625, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7594634294509888, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8739217519760132, + "num_tokens": 445273663.0, + "step": 11669 + }, + { + "epoch": 1.484543951151253, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8456307649612427, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8688523173332214, + "num_tokens": 445313790.0, + "step": 11670 + }, + { + "epoch": 1.4846711614298436, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8898963928222656, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8747191429138184, + "num_tokens": 445351818.0, + "step": 11671 + }, + { + "epoch": 1.484798371708434, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0532143115997314, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8399003744125366, + "num_tokens": 445391763.0, + "step": 11672 + }, + { + "epoch": 1.4849255819870246, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8873696327209473, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.857659101486206, + "num_tokens": 445426723.0, + "step": 11673 + }, + { + "epoch": 1.485052792265615, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.094531774520874, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.868095874786377, + "num_tokens": 445465635.0, + "step": 11674 + }, + { + "epoch": 1.4851800025442055, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8728567361831665, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8645051717758179, + "num_tokens": 445502819.0, + "step": 11675 + }, + { + "epoch": 1.485307212822796, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8125419616699219, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8746599555015564, + "num_tokens": 445541861.0, + "step": 11676 + }, + { + "epoch": 1.4854344231013865, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9218207597732544, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8787867426872253, + "num_tokens": 445578058.0, + "step": 11677 + }, + { + "epoch": 1.485561633379977, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0111405849456787, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8504184484481812, + "num_tokens": 445612896.0, + "step": 11678 + }, + { + "epoch": 1.4856888436585676, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.84403657913208, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8570656776428223, + "num_tokens": 445647849.0, + "step": 11679 + }, + { + "epoch": 1.485816053937158, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2602152824401855, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.849145770072937, + "num_tokens": 445689199.0, + "step": 11680 + }, + { + "epoch": 1.4859432642157486, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9506629705429077, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8629482984542847, + "num_tokens": 445727869.0, + "step": 11681 + }, + { + "epoch": 1.4860704744943392, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8623346090316772, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.868345320224762, + "num_tokens": 445770105.0, + "step": 11682 + }, + { + "epoch": 1.4861976847729297, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.922795057296753, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8438605666160583, + "num_tokens": 445812950.0, + "step": 11683 + }, + { + "epoch": 1.4863248950515202, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9180285930633545, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8620325922966003, + "num_tokens": 445850811.0, + "step": 11684 + }, + { + "epoch": 1.4864521053301107, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9359149932861328, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8668352365493774, + "num_tokens": 445891967.0, + "step": 11685 + }, + { + "epoch": 1.4865793156087013, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8794915676116943, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8662157654762268, + "num_tokens": 445927936.0, + "step": 11686 + }, + { + "epoch": 1.4867065258872918, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8707189559936523, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8768472671508789, + "num_tokens": 445962481.0, + "step": 11687 + }, + { + "epoch": 1.4868337361658823, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8276398181915283, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8777750730514526, + "num_tokens": 446001445.0, + "step": 11688 + }, + { + "epoch": 1.4869609464444726, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1255533695220947, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8846776485443115, + "num_tokens": 446037904.0, + "step": 11689 + }, + { + "epoch": 1.4870881567230632, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0642669200897217, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8737213611602783, + "num_tokens": 446077583.0, + "step": 11690 + }, + { + "epoch": 1.4872153670016537, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1248459815979004, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8615137338638306, + "num_tokens": 446110123.0, + "step": 11691 + }, + { + "epoch": 1.4873425772802442, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8101959228515625, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.862969160079956, + "num_tokens": 446149832.0, + "step": 11692 + }, + { + "epoch": 1.4874697875588347, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1515512466430664, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8715564608573914, + "num_tokens": 446184211.0, + "step": 11693 + }, + { + "epoch": 1.4875969978374253, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7769204378128052, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8577251434326172, + "num_tokens": 446229620.0, + "step": 11694 + }, + { + "epoch": 1.4877242081160158, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0006134510040283, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8658105134963989, + "num_tokens": 446267134.0, + "step": 11695 + }, + { + "epoch": 1.4878514183946063, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0675699710845947, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8509422540664673, + "num_tokens": 446303001.0, + "step": 11696 + }, + { + "epoch": 1.4879786286731969, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9812880754470825, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8628977537155151, + "num_tokens": 446334286.0, + "step": 11697 + }, + { + "epoch": 1.4881058389517874, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9018131494522095, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8825422525405884, + "num_tokens": 446369547.0, + "step": 11698 + }, + { + "epoch": 1.4882330492303777, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.880274772644043, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8493442535400391, + "num_tokens": 446408179.0, + "step": 11699 + }, + { + "epoch": 1.4883602595089682, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.226869583129883, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8669295310974121, + "num_tokens": 446448533.0, + "step": 11700 + }, + { + "epoch": 1.4884874697875587, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9937807321548462, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.862366795539856, + "num_tokens": 446489445.0, + "step": 11701 + }, + { + "epoch": 1.4886146800661493, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9320627450942993, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8776086568832397, + "num_tokens": 446526530.0, + "step": 11702 + }, + { + "epoch": 1.4887418903447398, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 4.400259017944336, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8647643327713013, + "num_tokens": 446558315.0, + "step": 11703 + }, + { + "epoch": 1.4888691006233303, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9007480144500732, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8680555820465088, + "num_tokens": 446598548.0, + "step": 11704 + }, + { + "epoch": 1.4889963109019209, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.036837100982666, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8606163263320923, + "num_tokens": 446632111.0, + "step": 11705 + }, + { + "epoch": 1.4891235211805114, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8697657585144043, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8725204467773438, + "num_tokens": 446669296.0, + "step": 11706 + }, + { + "epoch": 1.489250731459102, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8136038780212402, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8671469688415527, + "num_tokens": 446707891.0, + "step": 11707 + }, + { + "epoch": 1.4893779417376924, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9610644578933716, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8530685901641846, + "num_tokens": 446749455.0, + "step": 11708 + }, + { + "epoch": 1.489505152016283, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.144355297088623, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8773945569992065, + "num_tokens": 446787001.0, + "step": 11709 + }, + { + "epoch": 1.4896323622948735, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8915369510650635, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.862092137336731, + "num_tokens": 446827262.0, + "step": 11710 + }, + { + "epoch": 1.489759572573464, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0388636589050293, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8792956471443176, + "num_tokens": 446860514.0, + "step": 11711 + }, + { + "epoch": 1.4898867828520546, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8990416526794434, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8554603457450867, + "num_tokens": 446902605.0, + "step": 11712 + }, + { + "epoch": 1.490013993130645, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9598824977874756, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8706263899803162, + "num_tokens": 446938638.0, + "step": 11713 + }, + { + "epoch": 1.4901412034092354, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8613206148147583, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8696925640106201, + "num_tokens": 446979286.0, + "step": 11714 + }, + { + "epoch": 1.490268413687826, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1128628253936768, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.848468542098999, + "num_tokens": 447017405.0, + "step": 11715 + }, + { + "epoch": 1.4903956239664164, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8188039064407349, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8663682341575623, + "num_tokens": 447056733.0, + "step": 11716 + }, + { + "epoch": 1.490522834245007, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8701781034469604, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8585854172706604, + "num_tokens": 447094493.0, + "step": 11717 + }, + { + "epoch": 1.4906500445235975, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.866808533668518, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8795244693756104, + "num_tokens": 447134039.0, + "step": 11718 + }, + { + "epoch": 1.490777254802188, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9288585186004639, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8720585107803345, + "num_tokens": 447168320.0, + "step": 11719 + }, + { + "epoch": 1.4909044650807786, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0068893432617188, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8495755195617676, + "num_tokens": 447205640.0, + "step": 11720 + }, + { + "epoch": 1.491031675359369, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.053893804550171, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8781753778457642, + "num_tokens": 447239831.0, + "step": 11721 + }, + { + "epoch": 1.4911588856379596, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8483359813690186, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8501861095428467, + "num_tokens": 447281100.0, + "step": 11722 + }, + { + "epoch": 1.49128609591655, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0780749320983887, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8452132940292358, + "num_tokens": 447316137.0, + "step": 11723 + }, + { + "epoch": 1.4914133061951405, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9412727355957031, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8568816781044006, + "num_tokens": 447357922.0, + "step": 11724 + }, + { + "epoch": 1.491540516473731, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8819661140441895, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.861777663230896, + "num_tokens": 447394905.0, + "step": 11725 + }, + { + "epoch": 1.4916677267523215, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8294172286987305, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8758705854415894, + "num_tokens": 447435379.0, + "step": 11726 + }, + { + "epoch": 1.491794937030912, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.813641905784607, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8562486171722412, + "num_tokens": 447473058.0, + "step": 11727 + }, + { + "epoch": 1.4919221473095026, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9060020446777344, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8740774393081665, + "num_tokens": 447510966.0, + "step": 11728 + }, + { + "epoch": 1.492049357588093, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7319234609603882, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8670668601989746, + "num_tokens": 447554747.0, + "step": 11729 + }, + { + "epoch": 1.4921765678666836, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.957681655883789, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.867172122001648, + "num_tokens": 447594426.0, + "step": 11730 + }, + { + "epoch": 1.4923037781452742, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8265595436096191, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8573925495147705, + "num_tokens": 447634786.0, + "step": 11731 + }, + { + "epoch": 1.4924309884238647, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9074609279632568, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8618370890617371, + "num_tokens": 447668682.0, + "step": 11732 + }, + { + "epoch": 1.4925581987024552, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.94627046585083, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8543962836265564, + "num_tokens": 447705566.0, + "step": 11733 + }, + { + "epoch": 1.4926854089810457, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.938719391822815, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8612704873085022, + "num_tokens": 447746385.0, + "step": 11734 + }, + { + "epoch": 1.4928126192596363, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7376281023025513, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8687527775764465, + "num_tokens": 447790483.0, + "step": 11735 + }, + { + "epoch": 1.4929398295382268, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1863903999328613, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8624217510223389, + "num_tokens": 447826872.0, + "step": 11736 + }, + { + "epoch": 1.4930670398168173, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.916697382926941, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8693145513534546, + "num_tokens": 447864071.0, + "step": 11737 + }, + { + "epoch": 1.4931942500954076, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9764364957809448, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8464363217353821, + "num_tokens": 447903320.0, + "step": 11738 + }, + { + "epoch": 1.4933214603739982, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.818656325340271, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8782229423522949, + "num_tokens": 447940253.0, + "step": 11739 + }, + { + "epoch": 1.4934486706525887, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.97318696975708, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8696704506874084, + "num_tokens": 447978277.0, + "step": 11740 + }, + { + "epoch": 1.4935758809311792, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8966346979141235, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.853459894657135, + "num_tokens": 448018798.0, + "step": 11741 + }, + { + "epoch": 1.4937030912097697, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.915549397468567, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.853801965713501, + "num_tokens": 448057469.0, + "step": 11742 + }, + { + "epoch": 1.4938303014883603, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0454659461975098, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8513661623001099, + "num_tokens": 448089032.0, + "step": 11743 + }, + { + "epoch": 1.4939575117669508, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.062122344970703, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8596242070198059, + "num_tokens": 448122438.0, + "step": 11744 + }, + { + "epoch": 1.4940847220455413, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8827786445617676, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8636969327926636, + "num_tokens": 448166844.0, + "step": 11745 + }, + { + "epoch": 1.4942119323241319, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1562886238098145, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8545893430709839, + "num_tokens": 448197766.0, + "step": 11746 + }, + { + "epoch": 1.4943391426027224, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.898033618927002, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8518836498260498, + "num_tokens": 448240850.0, + "step": 11747 + }, + { + "epoch": 1.4944663528813127, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.959817886352539, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.874118447303772, + "num_tokens": 448274187.0, + "step": 11748 + }, + { + "epoch": 1.4945935631599032, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.895592451095581, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8699941039085388, + "num_tokens": 448311069.0, + "step": 11749 + }, + { + "epoch": 1.4947207734384937, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7277882099151611, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8636936545372009, + "num_tokens": 448351536.0, + "step": 11750 + }, + { + "epoch": 1.4948479837170843, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9802632331848145, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8612794280052185, + "num_tokens": 448385108.0, + "step": 11751 + }, + { + "epoch": 1.4949751939956748, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.825462818145752, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8745429515838623, + "num_tokens": 448423272.0, + "step": 11752 + }, + { + "epoch": 1.4951024042742653, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 3.9686901569366455, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8683099746704102, + "num_tokens": 448462904.0, + "step": 11753 + }, + { + "epoch": 1.4952296145528559, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0637528896331787, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8610533475875854, + "num_tokens": 448503257.0, + "step": 11754 + }, + { + "epoch": 1.4953568248314464, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9366106986999512, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8693805932998657, + "num_tokens": 448535299.0, + "step": 11755 + }, + { + "epoch": 1.495484035110037, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9406639337539673, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8710178136825562, + "num_tokens": 448568878.0, + "step": 11756 + }, + { + "epoch": 1.4956112453886274, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.866927146911621, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8821432590484619, + "num_tokens": 448604959.0, + "step": 11757 + }, + { + "epoch": 1.495738455667218, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.081986665725708, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8761175870895386, + "num_tokens": 448638487.0, + "step": 11758 + }, + { + "epoch": 1.4958656659458085, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.787235140800476, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.870334267616272, + "num_tokens": 448677212.0, + "step": 11759 + }, + { + "epoch": 1.495992876224399, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0012338161468506, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.858746349811554, + "num_tokens": 448719654.0, + "step": 11760 + }, + { + "epoch": 1.4961200865029896, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8893080949783325, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8679090142250061, + "num_tokens": 448760783.0, + "step": 11761 + }, + { + "epoch": 1.49624729678158, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.906935691833496, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8536714315414429, + "num_tokens": 448802843.0, + "step": 11762 + }, + { + "epoch": 1.4963745070601704, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.077873945236206, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8615036010742188, + "num_tokens": 448838430.0, + "step": 11763 + }, + { + "epoch": 1.496501717338761, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8551536798477173, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8554834723472595, + "num_tokens": 448880450.0, + "step": 11764 + }, + { + "epoch": 1.4966289276173514, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0042459964752197, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8655199408531189, + "num_tokens": 448919352.0, + "step": 11765 + }, + { + "epoch": 1.496756137895942, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7630231380462646, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8395017385482788, + "num_tokens": 448965402.0, + "step": 11766 + }, + { + "epoch": 1.4968833481745325, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1233699321746826, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8754847049713135, + "num_tokens": 448996983.0, + "step": 11767 + }, + { + "epoch": 1.497010558453123, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.011275053024292, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8505391478538513, + "num_tokens": 449036555.0, + "step": 11768 + }, + { + "epoch": 1.4971377687317136, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9839463233947754, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8759778738021851, + "num_tokens": 449070545.0, + "step": 11769 + }, + { + "epoch": 1.497264979010304, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9266639947891235, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.848809003829956, + "num_tokens": 449108859.0, + "step": 11770 + }, + { + "epoch": 1.4973921892888946, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9150292873382568, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8622006773948669, + "num_tokens": 449145333.0, + "step": 11771 + }, + { + "epoch": 1.497519399567485, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8761365413665771, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8582659959793091, + "num_tokens": 449181292.0, + "step": 11772 + }, + { + "epoch": 1.4976466098460754, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 16.632808685302734, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8673644661903381, + "num_tokens": 449213940.0, + "step": 11773 + }, + { + "epoch": 1.497773820124666, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2149317264556885, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.876541256904602, + "num_tokens": 449252487.0, + "step": 11774 + }, + { + "epoch": 1.4979010304032565, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.033280611038208, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8568954467773438, + "num_tokens": 449287268.0, + "step": 11775 + }, + { + "epoch": 1.498028240681847, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.988925814628601, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8602085113525391, + "num_tokens": 449326975.0, + "step": 11776 + }, + { + "epoch": 1.4981554509604376, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8571110963821411, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8633619546890259, + "num_tokens": 449366061.0, + "step": 11777 + }, + { + "epoch": 1.498282661239028, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8663349151611328, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8725625276565552, + "num_tokens": 449401763.0, + "step": 11778 + }, + { + "epoch": 1.4984098715176186, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.829113245010376, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.883562445640564, + "num_tokens": 449436605.0, + "step": 11779 + }, + { + "epoch": 1.4985370817962091, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0312159061431885, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8590267300605774, + "num_tokens": 449474523.0, + "step": 11780 + }, + { + "epoch": 1.4986642920747997, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.791803240776062, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8762168884277344, + "num_tokens": 449513301.0, + "step": 11781 + }, + { + "epoch": 1.4987915023533902, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9130412340164185, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8600847125053406, + "num_tokens": 449547826.0, + "step": 11782 + }, + { + "epoch": 1.4989187126319807, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.948791742324829, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.863013744354248, + "num_tokens": 449584818.0, + "step": 11783 + }, + { + "epoch": 1.4990459229105713, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8322371244430542, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8456542491912842, + "num_tokens": 449625368.0, + "step": 11784 + }, + { + "epoch": 1.4991731331891618, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9153763055801392, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8503952622413635, + "num_tokens": 449666320.0, + "step": 11785 + }, + { + "epoch": 1.4993003434677523, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9489960670471191, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8569989204406738, + "num_tokens": 449705993.0, + "step": 11786 + }, + { + "epoch": 1.4994275537463426, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9301930665969849, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8581186532974243, + "num_tokens": 449747088.0, + "step": 11787 + }, + { + "epoch": 1.4995547640249332, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7420462369918823, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8657361268997192, + "num_tokens": 449789840.0, + "step": 11788 + }, + { + "epoch": 1.4996819743035237, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8794658184051514, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8631808757781982, + "num_tokens": 449830599.0, + "step": 11789 + }, + { + "epoch": 1.4998091845821142, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0809004306793213, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8681431412696838, + "num_tokens": 449864476.0, + "step": 11790 + }, + { + "epoch": 1.4999363948607047, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.14453387260437, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8467057943344116, + "num_tokens": 449903791.0, + "step": 11791 + }, + { + "epoch": 1.5000636051392953, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8357219696044922, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8631566762924194, + "num_tokens": 449945478.0, + "step": 11792 + }, + { + "epoch": 1.5001908154178858, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1193723678588867, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8608515858650208, + "num_tokens": 449986175.0, + "step": 11793 + }, + { + "epoch": 1.5003180256964763, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9164795875549316, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8602378368377686, + "num_tokens": 450025537.0, + "step": 11794 + }, + { + "epoch": 1.5004452359750666, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9704102277755737, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8502436280250549, + "num_tokens": 450068269.0, + "step": 11795 + }, + { + "epoch": 1.5005724462536572, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.417292356491089, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8807363510131836, + "num_tokens": 450106234.0, + "step": 11796 + }, + { + "epoch": 1.5006996565322477, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0058789253234863, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8740068078041077, + "num_tokens": 450141916.0, + "step": 11797 + }, + { + "epoch": 1.5008268668108382, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.099057197570801, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8492465615272522, + "num_tokens": 450175427.0, + "step": 11798 + }, + { + "epoch": 1.5009540770894287, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9336504936218262, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8659705519676208, + "num_tokens": 450211104.0, + "step": 11799 + }, + { + "epoch": 1.5010812873680193, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9896656274795532, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.855772852897644, + "num_tokens": 450244844.0, + "step": 11800 + }, + { + "epoch": 1.5012084976466098, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8729451894760132, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8618149757385254, + "num_tokens": 450286094.0, + "step": 11801 + }, + { + "epoch": 1.5013357079252003, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0613348484039307, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8544931411743164, + "num_tokens": 450324938.0, + "step": 11802 + }, + { + "epoch": 1.5014629182037909, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7813029289245605, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8634195327758789, + "num_tokens": 450368572.0, + "step": 11803 + }, + { + "epoch": 1.5015901284823814, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8760000467300415, + "learning_rate": 1e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8420451283454895, + "num_tokens": 450410926.0, + "step": 11804 + }, + { + "epoch": 1.501717338760972, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9557595252990723, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8680422306060791, + "num_tokens": 450450206.0, + "step": 11805 + }, + { + "epoch": 1.5018445490395624, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9618383646011353, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.843127965927124, + "num_tokens": 450488998.0, + "step": 11806 + }, + { + "epoch": 1.501971759318153, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8094829320907593, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8744776248931885, + "num_tokens": 450530476.0, + "step": 11807 + }, + { + "epoch": 1.5020989695967435, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.122429847717285, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8708542585372925, + "num_tokens": 450560655.0, + "step": 11808 + }, + { + "epoch": 1.502226179875334, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0799896717071533, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8666344881057739, + "num_tokens": 450601816.0, + "step": 11809 + }, + { + "epoch": 1.5023533901539246, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9965506792068481, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8664900064468384, + "num_tokens": 450634182.0, + "step": 11810 + }, + { + "epoch": 1.502480600432515, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8760372400283813, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8600906133651733, + "num_tokens": 450672705.0, + "step": 11811 + }, + { + "epoch": 1.5026078107111056, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.119412899017334, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8514660000801086, + "num_tokens": 450705636.0, + "step": 11812 + }, + { + "epoch": 1.502735020989696, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.219020128250122, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8562725782394409, + "num_tokens": 450736165.0, + "step": 11813 + }, + { + "epoch": 1.5028622312682864, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9331693649291992, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8648554682731628, + "num_tokens": 450776939.0, + "step": 11814 + }, + { + "epoch": 1.502989441546877, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9738177061080933, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.867567777633667, + "num_tokens": 450811994.0, + "step": 11815 + }, + { + "epoch": 1.5031166518254675, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8470613956451416, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8653159737586975, + "num_tokens": 450853688.0, + "step": 11816 + }, + { + "epoch": 1.503243862104058, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9220240116119385, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8632712364196777, + "num_tokens": 450892100.0, + "step": 11817 + }, + { + "epoch": 1.5033710723826486, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7796094417572021, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8799259662628174, + "num_tokens": 450936383.0, + "step": 11818 + }, + { + "epoch": 1.503498282661239, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9771738052368164, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8700546026229858, + "num_tokens": 450965525.0, + "step": 11819 + }, + { + "epoch": 1.5036254929398294, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8285877704620361, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8523062467575073, + "num_tokens": 451005874.0, + "step": 11820 + }, + { + "epoch": 1.50375270321842, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.053297519683838, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8455369472503662, + "num_tokens": 451040668.0, + "step": 11821 + }, + { + "epoch": 1.5038799134970104, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8253523111343384, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.860774576663971, + "num_tokens": 451082747.0, + "step": 11822 + }, + { + "epoch": 1.504007123775601, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.074406623840332, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8709462285041809, + "num_tokens": 451120344.0, + "step": 11823 + }, + { + "epoch": 1.5041343340541915, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9064849615097046, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8553981781005859, + "num_tokens": 451156564.0, + "step": 11824 + }, + { + "epoch": 1.504261544332782, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8458000421524048, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8586980104446411, + "num_tokens": 451197252.0, + "step": 11825 + }, + { + "epoch": 1.5043887546113726, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8954025506973267, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8523437976837158, + "num_tokens": 451236150.0, + "step": 11826 + }, + { + "epoch": 1.504515964889963, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.923977017402649, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8592395186424255, + "num_tokens": 451275751.0, + "step": 11827 + }, + { + "epoch": 1.5046431751685536, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8498492240905762, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8831180334091187, + "num_tokens": 451316778.0, + "step": 11828 + }, + { + "epoch": 1.5047703854471441, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8408498764038086, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8629776239395142, + "num_tokens": 451359921.0, + "step": 11829 + }, + { + "epoch": 1.5048975957257347, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9513357877731323, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8692978620529175, + "num_tokens": 451395616.0, + "step": 11830 + }, + { + "epoch": 1.5050248060043252, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9281890392303467, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8645782470703125, + "num_tokens": 451427766.0, + "step": 11831 + }, + { + "epoch": 1.5051520162829157, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.957510232925415, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8715723752975464, + "num_tokens": 451463073.0, + "step": 11832 + }, + { + "epoch": 1.5052792265615063, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8837971687316895, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8823431730270386, + "num_tokens": 451503039.0, + "step": 11833 + }, + { + "epoch": 1.5054064368400968, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.058361530303955, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8466635942459106, + "num_tokens": 451541599.0, + "step": 11834 + }, + { + "epoch": 1.5055336471186873, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9322669506072998, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.849776029586792, + "num_tokens": 451579278.0, + "step": 11835 + }, + { + "epoch": 1.5056608573972778, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9177145957946777, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8666284084320068, + "num_tokens": 451618591.0, + "step": 11836 + }, + { + "epoch": 1.5057880676758684, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9644490480422974, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8631197810173035, + "num_tokens": 451654453.0, + "step": 11837 + }, + { + "epoch": 1.5059152779544587, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0877225399017334, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8622228503227234, + "num_tokens": 451688302.0, + "step": 11838 + }, + { + "epoch": 1.5060424882330492, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8583735227584839, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8696422576904297, + "num_tokens": 451727301.0, + "step": 11839 + }, + { + "epoch": 1.5061696985116397, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8412377834320068, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8501632213592529, + "num_tokens": 451770471.0, + "step": 11840 + }, + { + "epoch": 1.5062969087902303, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9079316854476929, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.851000189781189, + "num_tokens": 451809413.0, + "step": 11841 + }, + { + "epoch": 1.5064241190688208, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9200806617736816, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8610942363739014, + "num_tokens": 451850177.0, + "step": 11842 + }, + { + "epoch": 1.5065513293474113, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9567919969558716, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8521982431411743, + "num_tokens": 451887977.0, + "step": 11843 + }, + { + "epoch": 1.5066785396260016, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8383606672286987, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8734241724014282, + "num_tokens": 451927403.0, + "step": 11844 + }, + { + "epoch": 1.5068057499045922, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8915921449661255, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8640985488891602, + "num_tokens": 451967350.0, + "step": 11845 + }, + { + "epoch": 1.5069329601831827, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 80.5181655883789, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8632829785346985, + "num_tokens": 452008664.0, + "step": 11846 + }, + { + "epoch": 1.5070601704617732, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.9323062896728516, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8685923218727112, + "num_tokens": 452047303.0, + "step": 11847 + }, + { + "epoch": 1.5071873807403637, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0489065647125244, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8638200759887695, + "num_tokens": 452087359.0, + "step": 11848 + }, + { + "epoch": 1.5073145910189543, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 3.051203489303589, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.849044919013977, + "num_tokens": 452129426.0, + "step": 11849 + }, + { + "epoch": 1.5074418012975448, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9540599584579468, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.867662787437439, + "num_tokens": 452169134.0, + "step": 11850 + }, + { + "epoch": 1.5075690115761353, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8272638320922852, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8543325662612915, + "num_tokens": 452209962.0, + "step": 11851 + }, + { + "epoch": 1.5076962218547258, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.837769627571106, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8669471740722656, + "num_tokens": 452245978.0, + "step": 11852 + }, + { + "epoch": 1.5078234321333164, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8936623334884644, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8725016117095947, + "num_tokens": 452281013.0, + "step": 11853 + }, + { + "epoch": 1.507950642411907, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8935425281524658, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8766106963157654, + "num_tokens": 452317174.0, + "step": 11854 + }, + { + "epoch": 1.5080778526904974, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.798962950706482, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8593709468841553, + "num_tokens": 452362974.0, + "step": 11855 + }, + { + "epoch": 1.508205062969088, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9404621124267578, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8743190765380859, + "num_tokens": 452400701.0, + "step": 11856 + }, + { + "epoch": 1.5083322732476785, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 3.067422389984131, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8725628852844238, + "num_tokens": 452438541.0, + "step": 11857 + }, + { + "epoch": 1.508459483526269, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.996681809425354, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8582671880722046, + "num_tokens": 452476918.0, + "step": 11858 + }, + { + "epoch": 1.5085866938048595, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.946070909500122, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8675448298454285, + "num_tokens": 452509231.0, + "step": 11859 + }, + { + "epoch": 1.50871390408345, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9617664813995361, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.85892653465271, + "num_tokens": 452547079.0, + "step": 11860 + }, + { + "epoch": 1.5088411143620406, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.957148790359497, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8749521970748901, + "num_tokens": 452580215.0, + "step": 11861 + }, + { + "epoch": 1.508968324640631, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9451848268508911, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8699357509613037, + "num_tokens": 452615778.0, + "step": 11862 + }, + { + "epoch": 1.5090955349192214, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8767458200454712, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8658188581466675, + "num_tokens": 452652184.0, + "step": 11863 + }, + { + "epoch": 1.509222745197812, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8389538526535034, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.850185215473175, + "num_tokens": 452691345.0, + "step": 11864 + }, + { + "epoch": 1.5093499554764025, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8479914665222168, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8652710914611816, + "num_tokens": 452729982.0, + "step": 11865 + }, + { + "epoch": 1.509477165754993, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8320107460021973, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.861427903175354, + "num_tokens": 452771272.0, + "step": 11866 + }, + { + "epoch": 1.5096043760335836, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.852026104927063, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8592313528060913, + "num_tokens": 452812231.0, + "step": 11867 + }, + { + "epoch": 1.509731586312174, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8076057434082031, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8531063795089722, + "num_tokens": 452858406.0, + "step": 11868 + }, + { + "epoch": 1.5098587965907644, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7678707838058472, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.85490882396698, + "num_tokens": 452898793.0, + "step": 11869 + }, + { + "epoch": 1.509986006869355, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.814048409461975, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8695133924484253, + "num_tokens": 452937929.0, + "step": 11870 + }, + { + "epoch": 1.5101132171479454, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9153976440429688, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8669042587280273, + "num_tokens": 452975436.0, + "step": 11871 + }, + { + "epoch": 1.510240427426536, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.20382022857666, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8636246919631958, + "num_tokens": 453016334.0, + "step": 11872 + }, + { + "epoch": 1.5103676377051265, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8520960807800293, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8672266006469727, + "num_tokens": 453053630.0, + "step": 11873 + }, + { + "epoch": 1.510494847983717, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8998667001724243, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8445183634757996, + "num_tokens": 453092341.0, + "step": 11874 + }, + { + "epoch": 1.5106220582623076, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.048642158508301, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8721684217453003, + "num_tokens": 453126859.0, + "step": 11875 + }, + { + "epoch": 1.510749268540898, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9152264595031738, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8604088425636292, + "num_tokens": 453167327.0, + "step": 11876 + }, + { + "epoch": 1.5108764788194886, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0596227645874023, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8693633079528809, + "num_tokens": 453205901.0, + "step": 11877 + }, + { + "epoch": 1.5110036890980791, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.029148578643799, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8620656728744507, + "num_tokens": 453241856.0, + "step": 11878 + }, + { + "epoch": 1.5111308993766697, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7616691589355469, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8715267181396484, + "num_tokens": 453282528.0, + "step": 11879 + }, + { + "epoch": 1.5112581096552602, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9190653562545776, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8532819747924805, + "num_tokens": 453318762.0, + "step": 11880 + }, + { + "epoch": 1.5113853199338507, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9058846235275269, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8784581422805786, + "num_tokens": 453352482.0, + "step": 11881 + }, + { + "epoch": 1.5115125302124413, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8296098709106445, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8624979257583618, + "num_tokens": 453393692.0, + "step": 11882 + }, + { + "epoch": 1.5116397404910318, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9717695713043213, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8666393160820007, + "num_tokens": 453429217.0, + "step": 11883 + }, + { + "epoch": 1.5117669507696223, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9318122863769531, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8484108448028564, + "num_tokens": 453468190.0, + "step": 11884 + }, + { + "epoch": 1.5118941610482128, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.125248670578003, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8567311763763428, + "num_tokens": 453509893.0, + "step": 11885 + }, + { + "epoch": 1.5120213713268034, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8441716432571411, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8671579360961914, + "num_tokens": 453546972.0, + "step": 11886 + }, + { + "epoch": 1.5121485816053937, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0048141479492188, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8749867677688599, + "num_tokens": 453578480.0, + "step": 11887 + }, + { + "epoch": 1.5122757918839842, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.799356460571289, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8640753626823425, + "num_tokens": 453623248.0, + "step": 11888 + }, + { + "epoch": 1.5124030021625747, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1386559009552, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8839664459228516, + "num_tokens": 453660088.0, + "step": 11889 + }, + { + "epoch": 1.5125302124411653, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8665851354599, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8663376569747925, + "num_tokens": 453698931.0, + "step": 11890 + }, + { + "epoch": 1.5126574227197558, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9281774759292603, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.867167055606842, + "num_tokens": 453734220.0, + "step": 11891 + }, + { + "epoch": 1.5127846329983463, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9073772430419922, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8634834289550781, + "num_tokens": 453774056.0, + "step": 11892 + }, + { + "epoch": 1.5129118432769366, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9843302965164185, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8698293566703796, + "num_tokens": 453806616.0, + "step": 11893 + }, + { + "epoch": 1.5130390535555271, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8659718036651611, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.871726393699646, + "num_tokens": 453843838.0, + "step": 11894 + }, + { + "epoch": 1.5131662638341177, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9699925184249878, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8715662956237793, + "num_tokens": 453876888.0, + "step": 11895 + }, + { + "epoch": 1.5132934741127082, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0479469299316406, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.879001796245575, + "num_tokens": 453915334.0, + "step": 11896 + }, + { + "epoch": 1.5134206843912987, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9546594619750977, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8734374642372131, + "num_tokens": 453950590.0, + "step": 11897 + }, + { + "epoch": 1.5135478946698893, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7540249824523926, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8676788806915283, + "num_tokens": 453992590.0, + "step": 11898 + }, + { + "epoch": 1.5136751049484798, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7771073579788208, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8685734868049622, + "num_tokens": 454038619.0, + "step": 11899 + }, + { + "epoch": 1.5138023152270703, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.833204746246338, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.853810727596283, + "num_tokens": 454081564.0, + "step": 11900 + }, + { + "epoch": 1.5139295255056608, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8414705991744995, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8519119024276733, + "num_tokens": 454125581.0, + "step": 11901 + }, + { + "epoch": 1.5140567357842514, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7753626108169556, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8720329999923706, + "num_tokens": 454163961.0, + "step": 11902 + }, + { + "epoch": 1.514183946062842, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2689688205718994, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8567328453063965, + "num_tokens": 454200410.0, + "step": 11903 + }, + { + "epoch": 1.5143111563414324, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0229902267456055, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8499253392219543, + "num_tokens": 454237143.0, + "step": 11904 + }, + { + "epoch": 1.514438366620023, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0010757446289062, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.871129035949707, + "num_tokens": 454274022.0, + "step": 11905 + }, + { + "epoch": 1.5145655768986135, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9726052284240723, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8580130338668823, + "num_tokens": 454307955.0, + "step": 11906 + }, + { + "epoch": 1.514692787177204, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 6.429234981536865, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8717786073684692, + "num_tokens": 454345449.0, + "step": 11907 + }, + { + "epoch": 1.5148199974557945, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2011756896972656, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8513731956481934, + "num_tokens": 454383096.0, + "step": 11908 + }, + { + "epoch": 1.514947207734385, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.019260883331299, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8517404794692993, + "num_tokens": 454420142.0, + "step": 11909 + }, + { + "epoch": 1.5150744180129756, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0245752334594727, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8631862998008728, + "num_tokens": 454454991.0, + "step": 11910 + }, + { + "epoch": 1.515201628291566, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0925636291503906, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8604625463485718, + "num_tokens": 454490161.0, + "step": 11911 + }, + { + "epoch": 1.5153288385701564, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8098894357681274, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8557924628257751, + "num_tokens": 454534767.0, + "step": 11912 + }, + { + "epoch": 1.515456048848747, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0130069255828857, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8726078867912292, + "num_tokens": 454573246.0, + "step": 11913 + }, + { + "epoch": 1.5155832591273375, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.980833649635315, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8773895502090454, + "num_tokens": 454607626.0, + "step": 11914 + }, + { + "epoch": 1.515710469405928, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0284059047698975, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8588692545890808, + "num_tokens": 454639840.0, + "step": 11915 + }, + { + "epoch": 1.5158376796845185, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9423505067825317, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8553040027618408, + "num_tokens": 454681445.0, + "step": 11916 + }, + { + "epoch": 1.515964889963109, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.815953016281128, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8569574356079102, + "num_tokens": 454726191.0, + "step": 11917 + }, + { + "epoch": 1.5160921002416994, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8658329248428345, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8700150847434998, + "num_tokens": 454765112.0, + "step": 11918 + }, + { + "epoch": 1.51621931052029, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9066307544708252, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8601588010787964, + "num_tokens": 454805254.0, + "step": 11919 + }, + { + "epoch": 1.5163465207988804, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9685639142990112, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8570578694343567, + "num_tokens": 454841344.0, + "step": 11920 + }, + { + "epoch": 1.516473731077471, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.816759705543518, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8765320181846619, + "num_tokens": 454879986.0, + "step": 11921 + }, + { + "epoch": 1.5166009413560615, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0007565021514893, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8549535870552063, + "num_tokens": 454917731.0, + "step": 11922 + }, + { + "epoch": 1.516728151634652, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9380966424942017, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8691751956939697, + "num_tokens": 454952837.0, + "step": 11923 + }, + { + "epoch": 1.5168553619132426, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8406543731689453, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8661524057388306, + "num_tokens": 454992751.0, + "step": 11924 + }, + { + "epoch": 1.516982572191833, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9033701419830322, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8623824119567871, + "num_tokens": 455029829.0, + "step": 11925 + }, + { + "epoch": 1.5171097824704236, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9051257371902466, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8636514544487, + "num_tokens": 455072167.0, + "step": 11926 + }, + { + "epoch": 1.5172369927490141, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9518084526062012, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8716839551925659, + "num_tokens": 455108433.0, + "step": 11927 + }, + { + "epoch": 1.5173642030276047, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.901682734489441, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8680487871170044, + "num_tokens": 455148943.0, + "step": 11928 + }, + { + "epoch": 1.5174914133061952, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0512218475341797, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8592360019683838, + "num_tokens": 455185099.0, + "step": 11929 + }, + { + "epoch": 1.5176186235847857, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.930222988128662, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8789457082748413, + "num_tokens": 455225322.0, + "step": 11930 + }, + { + "epoch": 1.5177458338633762, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9392421245574951, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8607404828071594, + "num_tokens": 455262830.0, + "step": 11931 + }, + { + "epoch": 1.5178730441419668, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.828802227973938, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8718704581260681, + "num_tokens": 455300825.0, + "step": 11932 + }, + { + "epoch": 1.5180002544205573, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8440529108047485, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8601427674293518, + "num_tokens": 455336209.0, + "step": 11933 + }, + { + "epoch": 1.5181274646991478, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9230282306671143, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8630241751670837, + "num_tokens": 455374909.0, + "step": 11934 + }, + { + "epoch": 1.5182546749777384, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7721619606018066, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8767154216766357, + "num_tokens": 455417179.0, + "step": 11935 + }, + { + "epoch": 1.5183818852563287, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0433719158172607, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8502867221832275, + "num_tokens": 455457923.0, + "step": 11936 + }, + { + "epoch": 1.5185090955349192, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.00777530670166, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8552569150924683, + "num_tokens": 455492932.0, + "step": 11937 + }, + { + "epoch": 1.5186363058135097, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7750221490859985, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8543665409088135, + "num_tokens": 455540392.0, + "step": 11938 + }, + { + "epoch": 1.5187635160921003, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.045384168624878, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8588602542877197, + "num_tokens": 455574461.0, + "step": 11939 + }, + { + "epoch": 1.5188907263706908, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.968955159187317, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8609817028045654, + "num_tokens": 455608315.0, + "step": 11940 + }, + { + "epoch": 1.5190179366492813, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.77228581905365, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8673851490020752, + "num_tokens": 455654927.0, + "step": 11941 + }, + { + "epoch": 1.5191451469278716, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9357320070266724, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8519229292869568, + "num_tokens": 455695745.0, + "step": 11942 + }, + { + "epoch": 1.5192723572064621, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8032461404800415, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8631376028060913, + "num_tokens": 455734652.0, + "step": 11943 + }, + { + "epoch": 1.5193995674850527, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.726135015487671, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8407325148582458, + "num_tokens": 455776063.0, + "step": 11944 + }, + { + "epoch": 1.5195267777636432, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.5982656478881836, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8587981462478638, + "num_tokens": 455820595.0, + "step": 11945 + }, + { + "epoch": 1.5196539880422337, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8962000608444214, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8688499927520752, + "num_tokens": 455859553.0, + "step": 11946 + }, + { + "epoch": 1.5197811983208243, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.132988929748535, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.864357590675354, + "num_tokens": 455892676.0, + "step": 11947 + }, + { + "epoch": 1.5199084085994148, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.832004189491272, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8552867770195007, + "num_tokens": 455932106.0, + "step": 11948 + }, + { + "epoch": 1.5200356188780053, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8345123529434204, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8685793280601501, + "num_tokens": 455971555.0, + "step": 11949 + }, + { + "epoch": 1.5201628291565958, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.931320071220398, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.858019232749939, + "num_tokens": 456014177.0, + "step": 11950 + }, + { + "epoch": 1.5202900394351864, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.93850839138031, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8683655858039856, + "num_tokens": 456048229.0, + "step": 11951 + }, + { + "epoch": 1.520417249713777, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9069125652313232, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8827667236328125, + "num_tokens": 456085815.0, + "step": 11952 + }, + { + "epoch": 1.5205444599923674, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7369370460510254, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.865854024887085, + "num_tokens": 456128971.0, + "step": 11953 + }, + { + "epoch": 1.520671670270958, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0530762672424316, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8462420701980591, + "num_tokens": 456159705.0, + "step": 11954 + }, + { + "epoch": 1.5207988805495485, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9223079681396484, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8533881306648254, + "num_tokens": 456198251.0, + "step": 11955 + }, + { + "epoch": 1.520926090828139, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8790571689605713, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8566864132881165, + "num_tokens": 456237612.0, + "step": 11956 + }, + { + "epoch": 1.5210533011067295, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7991715669631958, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8634997606277466, + "num_tokens": 456273665.0, + "step": 11957 + }, + { + "epoch": 1.52118051138532, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0030550956726074, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8652984499931335, + "num_tokens": 456305467.0, + "step": 11958 + }, + { + "epoch": 1.5213077216639106, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8761085271835327, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8694836497306824, + "num_tokens": 456342951.0, + "step": 11959 + }, + { + "epoch": 1.521434931942501, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.099787473678589, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8633303642272949, + "num_tokens": 456375846.0, + "step": 11960 + }, + { + "epoch": 1.5215621422210914, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1151535511016846, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8714812994003296, + "num_tokens": 456407993.0, + "step": 11961 + }, + { + "epoch": 1.521689352499682, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.049489736557007, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8446316719055176, + "num_tokens": 456444278.0, + "step": 11962 + }, + { + "epoch": 1.5218165627782725, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8369622230529785, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.871715784072876, + "num_tokens": 456483950.0, + "step": 11963 + }, + { + "epoch": 1.521943773056863, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.726560354232788, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8642685413360596, + "num_tokens": 456527438.0, + "step": 11964 + }, + { + "epoch": 1.5220709833354535, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9966930150985718, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8752630949020386, + "num_tokens": 456560572.0, + "step": 11965 + }, + { + "epoch": 1.5221981936140438, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9840539693832397, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8541861772537231, + "num_tokens": 456604781.0, + "step": 11966 + }, + { + "epoch": 1.5223254038926344, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2135465145111084, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8764529824256897, + "num_tokens": 456639415.0, + "step": 11967 + }, + { + "epoch": 1.522452614171225, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0341224670410156, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8763108849525452, + "num_tokens": 456671183.0, + "step": 11968 + }, + { + "epoch": 1.5225798244498154, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9009369611740112, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8509405255317688, + "num_tokens": 456712995.0, + "step": 11969 + }, + { + "epoch": 1.522707034728406, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8943455219268799, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8584941029548645, + "num_tokens": 456750587.0, + "step": 11970 + }, + { + "epoch": 1.5228342450069965, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.011223554611206, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8698734045028687, + "num_tokens": 456787503.0, + "step": 11971 + }, + { + "epoch": 1.522961455285587, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9794625043869019, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8404470086097717, + "num_tokens": 456831645.0, + "step": 11972 + }, + { + "epoch": 1.5230886655641775, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.84616219997406, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8703025579452515, + "num_tokens": 456869143.0, + "step": 11973 + }, + { + "epoch": 1.523215875842768, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8940036296844482, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8612464666366577, + "num_tokens": 456906465.0, + "step": 11974 + }, + { + "epoch": 1.5233430861213586, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9109724760055542, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8627188205718994, + "num_tokens": 456942048.0, + "step": 11975 + }, + { + "epoch": 1.5234702963999491, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8794351816177368, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8818515539169312, + "num_tokens": 456975710.0, + "step": 11976 + }, + { + "epoch": 1.5235975066785397, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8120125532150269, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8525674343109131, + "num_tokens": 457022295.0, + "step": 11977 + }, + { + "epoch": 1.5237247169571302, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9297568798065186, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8547413945198059, + "num_tokens": 457060433.0, + "step": 11978 + }, + { + "epoch": 1.5238519272357207, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8354769945144653, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8697386384010315, + "num_tokens": 457101760.0, + "step": 11979 + }, + { + "epoch": 1.5239791375143112, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8069945573806763, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8790808320045471, + "num_tokens": 457142676.0, + "step": 11980 + }, + { + "epoch": 1.5241063477929018, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8602650165557861, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8698017001152039, + "num_tokens": 457179507.0, + "step": 11981 + }, + { + "epoch": 1.5242335580714923, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7087827920913696, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8647492527961731, + "num_tokens": 457221563.0, + "step": 11982 + }, + { + "epoch": 1.5243607683500828, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9433366060256958, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8609607219696045, + "num_tokens": 457258634.0, + "step": 11983 + }, + { + "epoch": 1.5244879786286734, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8818269968032837, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8779599666595459, + "num_tokens": 457293097.0, + "step": 11984 + }, + { + "epoch": 1.5246151889072637, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8072519302368164, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8776064515113831, + "num_tokens": 457333525.0, + "step": 11985 + }, + { + "epoch": 1.5247423991858542, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7262859344482422, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8500916361808777, + "num_tokens": 457379772.0, + "step": 11986 + }, + { + "epoch": 1.5248696094644447, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8040406703948975, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8688015937805176, + "num_tokens": 457415852.0, + "step": 11987 + }, + { + "epoch": 1.5249968197430352, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.048241376876831, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8677672147750854, + "num_tokens": 457449771.0, + "step": 11988 + }, + { + "epoch": 1.5251240300216258, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.071445941925049, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8600065112113953, + "num_tokens": 457489601.0, + "step": 11989 + }, + { + "epoch": 1.5252512403002163, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8207142353057861, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.877839982509613, + "num_tokens": 457530516.0, + "step": 11990 + }, + { + "epoch": 1.5253784505788066, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.026597261428833, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8589974641799927, + "num_tokens": 457568299.0, + "step": 11991 + }, + { + "epoch": 1.5255056608573971, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.024129867553711, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8626078367233276, + "num_tokens": 457600476.0, + "step": 11992 + }, + { + "epoch": 1.5256328711359877, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9279824495315552, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8831430077552795, + "num_tokens": 457632024.0, + "step": 11993 + }, + { + "epoch": 1.5257600814145782, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0126454830169678, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8698039650917053, + "num_tokens": 457670805.0, + "step": 11994 + }, + { + "epoch": 1.5258872916931687, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1377110481262207, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8641356825828552, + "num_tokens": 457702468.0, + "step": 11995 + }, + { + "epoch": 1.5260145019717593, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7893894910812378, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8729832172393799, + "num_tokens": 457740443.0, + "step": 11996 + }, + { + "epoch": 1.5261417122503498, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9565821886062622, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8663642406463623, + "num_tokens": 457775833.0, + "step": 11997 + }, + { + "epoch": 1.5262689225289403, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2852461338043213, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8691797256469727, + "num_tokens": 457815850.0, + "step": 11998 + }, + { + "epoch": 1.5263961328075308, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8369773626327515, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8710812330245972, + "num_tokens": 457854946.0, + "step": 11999 + }, + { + "epoch": 1.5265233430861214, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8351492881774902, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8692283034324646, + "num_tokens": 457890775.0, + "step": 12000 + }, + { + "epoch": 1.526650553364712, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9390498399734497, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8689804077148438, + "num_tokens": 457928143.0, + "step": 12001 + }, + { + "epoch": 1.5267777636433024, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7847537994384766, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8707786798477173, + "num_tokens": 457965999.0, + "step": 12002 + }, + { + "epoch": 1.526904973921893, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8298027515411377, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8636831045150757, + "num_tokens": 458006638.0, + "step": 12003 + }, + { + "epoch": 1.5270321842004835, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7392956018447876, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8652276992797852, + "num_tokens": 458047522.0, + "step": 12004 + }, + { + "epoch": 1.527159394479074, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8169842958450317, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.864861249923706, + "num_tokens": 458091219.0, + "step": 12005 + }, + { + "epoch": 1.5272866047576645, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0088393688201904, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8533399105072021, + "num_tokens": 458129208.0, + "step": 12006 + }, + { + "epoch": 1.527413815036255, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9677072763442993, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8596959114074707, + "num_tokens": 458163632.0, + "step": 12007 + }, + { + "epoch": 1.5275410253148456, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.982387900352478, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8586320281028748, + "num_tokens": 458202377.0, + "step": 12008 + }, + { + "epoch": 1.527668235593436, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9829367399215698, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8643167018890381, + "num_tokens": 458239901.0, + "step": 12009 + }, + { + "epoch": 1.5277954458720264, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7907185554504395, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8584447503089905, + "num_tokens": 458280291.0, + "step": 12010 + }, + { + "epoch": 1.527922656150617, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.4022510051727295, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8688766360282898, + "num_tokens": 458316931.0, + "step": 12011 + }, + { + "epoch": 1.5280498664292075, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8015861511230469, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8588658571243286, + "num_tokens": 458356477.0, + "step": 12012 + }, + { + "epoch": 1.528177076707798, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8526029586791992, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8491580486297607, + "num_tokens": 458401340.0, + "step": 12013 + }, + { + "epoch": 1.5283042869863885, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9549894332885742, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.870545506477356, + "num_tokens": 458436190.0, + "step": 12014 + }, + { + "epoch": 1.5284314972649788, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1097028255462646, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8646421432495117, + "num_tokens": 458465513.0, + "step": 12015 + }, + { + "epoch": 1.5285587075435694, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8612983226776123, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8614856004714966, + "num_tokens": 458505465.0, + "step": 12016 + }, + { + "epoch": 1.52868591782216, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.956743836402893, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8623175621032715, + "num_tokens": 458545087.0, + "step": 12017 + }, + { + "epoch": 1.5288131281007504, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.4188339710235596, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8722367882728577, + "num_tokens": 458583274.0, + "step": 12018 + }, + { + "epoch": 1.528940338379341, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7669756412506104, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8594878911972046, + "num_tokens": 458626026.0, + "step": 12019 + }, + { + "epoch": 1.5290675486579315, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6767760515213013, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8763495683670044, + "num_tokens": 458668774.0, + "step": 12020 + }, + { + "epoch": 1.529194758936522, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9326506853103638, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8682184815406799, + "num_tokens": 458702256.0, + "step": 12021 + }, + { + "epoch": 1.5293219692151125, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9110937118530273, + "learning_rate": 1e-06, + "loss": 0.523, + "mean_token_accuracy": 0.8380232453346252, + "num_tokens": 458741429.0, + "step": 12022 + }, + { + "epoch": 1.529449179493703, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8345190286636353, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8678790926933289, + "num_tokens": 458775608.0, + "step": 12023 + }, + { + "epoch": 1.5295763897722936, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8562805652618408, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8675076961517334, + "num_tokens": 458811451.0, + "step": 12024 + }, + { + "epoch": 1.5297036000508841, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7801822423934937, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8692998886108398, + "num_tokens": 458851761.0, + "step": 12025 + }, + { + "epoch": 1.5298308103294747, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.970733880996704, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8579029440879822, + "num_tokens": 458888948.0, + "step": 12026 + }, + { + "epoch": 1.5299580206080652, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0127100944519043, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8621770739555359, + "num_tokens": 458922466.0, + "step": 12027 + }, + { + "epoch": 1.5300852308866557, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.974383592605591, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8594964146614075, + "num_tokens": 458956947.0, + "step": 12028 + }, + { + "epoch": 1.5302124411652462, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0667927265167236, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8822042942047119, + "num_tokens": 458993634.0, + "step": 12029 + }, + { + "epoch": 1.5303396514438368, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.747559666633606, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8729382753372192, + "num_tokens": 459034314.0, + "step": 12030 + }, + { + "epoch": 1.5304668617224273, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8967788219451904, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8706483840942383, + "num_tokens": 459072117.0, + "step": 12031 + }, + { + "epoch": 1.5305940720010178, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8772351741790771, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8588249683380127, + "num_tokens": 459112382.0, + "step": 12032 + }, + { + "epoch": 1.5307212822796084, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7611842155456543, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8768218755722046, + "num_tokens": 459148973.0, + "step": 12033 + }, + { + "epoch": 1.5308484925581987, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.024174690246582, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8610728979110718, + "num_tokens": 459183242.0, + "step": 12034 + }, + { + "epoch": 1.5309757028367892, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.819901466369629, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8830359578132629, + "num_tokens": 459219606.0, + "step": 12035 + }, + { + "epoch": 1.5311029131153797, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9000403881072998, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8825671672821045, + "num_tokens": 459253763.0, + "step": 12036 + }, + { + "epoch": 1.5312301233939702, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9159201383590698, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8676171898841858, + "num_tokens": 459291187.0, + "step": 12037 + }, + { + "epoch": 1.5313573336725608, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9160529375076294, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8615767955780029, + "num_tokens": 459327423.0, + "step": 12038 + }, + { + "epoch": 1.5314845439511513, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8753823041915894, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8610795736312866, + "num_tokens": 459366207.0, + "step": 12039 + }, + { + "epoch": 1.5316117542297416, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 8.645359992980957, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8572754859924316, + "num_tokens": 459408338.0, + "step": 12040 + }, + { + "epoch": 1.5317389645083321, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0616047382354736, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8746674060821533, + "num_tokens": 459443420.0, + "step": 12041 + }, + { + "epoch": 1.5318661747869227, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1120338439941406, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8478268384933472, + "num_tokens": 459475835.0, + "step": 12042 + }, + { + "epoch": 1.5319933850655132, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0597081184387207, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8620681762695312, + "num_tokens": 459511993.0, + "step": 12043 + }, + { + "epoch": 1.5321205953441037, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9454110860824585, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8719432353973389, + "num_tokens": 459546221.0, + "step": 12044 + }, + { + "epoch": 1.5322478056226942, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6836673021316528, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8685517311096191, + "num_tokens": 459587271.0, + "step": 12045 + }, + { + "epoch": 1.5323750159012848, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8506886959075928, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8538601994514465, + "num_tokens": 459623388.0, + "step": 12046 + }, + { + "epoch": 1.5325022261798753, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9454268217086792, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8650020360946655, + "num_tokens": 459666440.0, + "step": 12047 + }, + { + "epoch": 1.5326294364584658, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8650846481323242, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8514781594276428, + "num_tokens": 459708373.0, + "step": 12048 + }, + { + "epoch": 1.5327566467370564, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9769988059997559, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.875566840171814, + "num_tokens": 459746707.0, + "step": 12049 + }, + { + "epoch": 1.532883857015647, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.781012773513794, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8709319829940796, + "num_tokens": 459792493.0, + "step": 12050 + }, + { + "epoch": 1.5330110672942374, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7863305807113647, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8724942207336426, + "num_tokens": 459828897.0, + "step": 12051 + }, + { + "epoch": 1.533138277572828, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1000235080718994, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.856503963470459, + "num_tokens": 459866274.0, + "step": 12052 + }, + { + "epoch": 1.5332654878514185, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9722539186477661, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8800389766693115, + "num_tokens": 459902442.0, + "step": 12053 + }, + { + "epoch": 1.533392698130009, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9056975841522217, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.87196284532547, + "num_tokens": 459943331.0, + "step": 12054 + }, + { + "epoch": 1.5335199084085995, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9345464706420898, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.870398998260498, + "num_tokens": 459975349.0, + "step": 12055 + }, + { + "epoch": 1.53364711868719, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9428297281265259, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8785458207130432, + "num_tokens": 460008295.0, + "step": 12056 + }, + { + "epoch": 1.5337743289657806, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0438232421875, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8691357374191284, + "num_tokens": 460045892.0, + "step": 12057 + }, + { + "epoch": 1.533901539244371, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1906819343566895, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8653173446655273, + "num_tokens": 460084255.0, + "step": 12058 + }, + { + "epoch": 1.5340287495229614, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.818325400352478, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8510465025901794, + "num_tokens": 460123706.0, + "step": 12059 + }, + { + "epoch": 1.534155959801552, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8029274940490723, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8506560325622559, + "num_tokens": 460162351.0, + "step": 12060 + }, + { + "epoch": 1.5342831700801425, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8960604667663574, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8558897972106934, + "num_tokens": 460200135.0, + "step": 12061 + }, + { + "epoch": 1.534410380358733, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6744310855865479, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8586090803146362, + "num_tokens": 460243585.0, + "step": 12062 + }, + { + "epoch": 1.5345375906373235, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8696825504302979, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8575010895729065, + "num_tokens": 460283695.0, + "step": 12063 + }, + { + "epoch": 1.5346648009159138, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0552456378936768, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8730095624923706, + "num_tokens": 460317241.0, + "step": 12064 + }, + { + "epoch": 1.5347920111945044, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8269970417022705, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8671469688415527, + "num_tokens": 460351900.0, + "step": 12065 + }, + { + "epoch": 1.534919221473095, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.043607234954834, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8662630915641785, + "num_tokens": 460391672.0, + "step": 12066 + }, + { + "epoch": 1.5350464317516854, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8840197324752808, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8579070568084717, + "num_tokens": 460434120.0, + "step": 12067 + }, + { + "epoch": 1.535173642030276, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0537467002868652, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8666514754295349, + "num_tokens": 460474214.0, + "step": 12068 + }, + { + "epoch": 1.5353008523088665, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0236504077911377, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8702303767204285, + "num_tokens": 460507549.0, + "step": 12069 + }, + { + "epoch": 1.535428062587457, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1553170680999756, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8581355810165405, + "num_tokens": 460546391.0, + "step": 12070 + }, + { + "epoch": 1.5355552728660475, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9571166038513184, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8541719913482666, + "num_tokens": 460585232.0, + "step": 12071 + }, + { + "epoch": 1.535682483144638, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8344449996948242, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8721789717674255, + "num_tokens": 460624398.0, + "step": 12072 + }, + { + "epoch": 1.5358096934232286, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.804634928703308, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8569216728210449, + "num_tokens": 460667193.0, + "step": 12073 + }, + { + "epoch": 1.5359369037018191, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1183695793151855, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8412036299705505, + "num_tokens": 460710614.0, + "step": 12074 + }, + { + "epoch": 1.5360641139804097, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.110797643661499, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8606597185134888, + "num_tokens": 460751079.0, + "step": 12075 + }, + { + "epoch": 1.5361913242590002, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9119824171066284, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8655637502670288, + "num_tokens": 460786768.0, + "step": 12076 + }, + { + "epoch": 1.5363185345375907, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.841366171836853, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8538897037506104, + "num_tokens": 460824757.0, + "step": 12077 + }, + { + "epoch": 1.5364457448161812, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8496522903442383, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8574354648590088, + "num_tokens": 460865024.0, + "step": 12078 + }, + { + "epoch": 1.5365729550947718, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.061591386795044, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8543565273284912, + "num_tokens": 460900895.0, + "step": 12079 + }, + { + "epoch": 1.5367001653733623, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.980930209159851, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8691747784614563, + "num_tokens": 460936876.0, + "step": 12080 + }, + { + "epoch": 1.5368273756519528, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.113853931427002, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.848827600479126, + "num_tokens": 460975297.0, + "step": 12081 + }, + { + "epoch": 1.5369545859305433, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1284384727478027, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8526710271835327, + "num_tokens": 461008603.0, + "step": 12082 + }, + { + "epoch": 1.5370817962091337, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0950403213500977, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8563725352287292, + "num_tokens": 461047770.0, + "step": 12083 + }, + { + "epoch": 1.5372090064877242, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.931754231452942, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.850823163986206, + "num_tokens": 461088554.0, + "step": 12084 + }, + { + "epoch": 1.5373362167663147, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8850457668304443, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8419325947761536, + "num_tokens": 461126070.0, + "step": 12085 + }, + { + "epoch": 1.5374634270449052, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8802956342697144, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8614116907119751, + "num_tokens": 461166394.0, + "step": 12086 + }, + { + "epoch": 1.5375906373234958, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7613084316253662, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8562865853309631, + "num_tokens": 461208263.0, + "step": 12087 + }, + { + "epoch": 1.5377178476020863, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7285799980163574, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8753159046173096, + "num_tokens": 461245054.0, + "step": 12088 + }, + { + "epoch": 1.5378450578806766, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.4375414848327637, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.854214072227478, + "num_tokens": 461281584.0, + "step": 12089 + }, + { + "epoch": 1.5379722681592671, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 20.45513343811035, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8733664751052856, + "num_tokens": 461322247.0, + "step": 12090 + }, + { + "epoch": 1.5380994784378577, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.2160370349884033, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8513699769973755, + "num_tokens": 461358371.0, + "step": 12091 + }, + { + "epoch": 1.5382266887164482, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.1315994262695312, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8710394501686096, + "num_tokens": 461394666.0, + "step": 12092 + }, + { + "epoch": 1.5383538989950387, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8404062986373901, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8770880699157715, + "num_tokens": 461430875.0, + "step": 12093 + }, + { + "epoch": 1.5384811092736292, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7896499633789062, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8599578142166138, + "num_tokens": 461467412.0, + "step": 12094 + }, + { + "epoch": 1.5386083195522198, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9923852682113647, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8438463807106018, + "num_tokens": 461501970.0, + "step": 12095 + }, + { + "epoch": 1.5387355298308103, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.057793140411377, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.851726233959198, + "num_tokens": 461542735.0, + "step": 12096 + }, + { + "epoch": 1.5388627401094008, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7907724380493164, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8660510778427124, + "num_tokens": 461583883.0, + "step": 12097 + }, + { + "epoch": 1.5389899503879914, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8816850185394287, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8558986186981201, + "num_tokens": 461625124.0, + "step": 12098 + }, + { + "epoch": 1.5391171606665819, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0007951259613037, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8545780181884766, + "num_tokens": 461660273.0, + "step": 12099 + }, + { + "epoch": 1.5392443709451724, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9481024742126465, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8582814335823059, + "num_tokens": 461698890.0, + "step": 12100 + }, + { + "epoch": 1.539371581223763, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9702321290969849, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8634323477745056, + "num_tokens": 461736927.0, + "step": 12101 + }, + { + "epoch": 1.5394987915023535, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7721061706542969, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8602607846260071, + "num_tokens": 461777796.0, + "step": 12102 + }, + { + "epoch": 1.539626001780944, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.257890462875366, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8729679584503174, + "num_tokens": 461814656.0, + "step": 12103 + }, + { + "epoch": 1.5397532120595345, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9486558437347412, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8459492921829224, + "num_tokens": 461852186.0, + "step": 12104 + }, + { + "epoch": 1.539880422338125, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7382570505142212, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8775748014450073, + "num_tokens": 461893724.0, + "step": 12105 + }, + { + "epoch": 1.5400076326167156, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9923518896102905, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8503440618515015, + "num_tokens": 461935380.0, + "step": 12106 + }, + { + "epoch": 1.5401348428953059, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8829588890075684, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8687875866889954, + "num_tokens": 461971463.0, + "step": 12107 + }, + { + "epoch": 1.5402620531738964, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9517463445663452, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8704136610031128, + "num_tokens": 462007514.0, + "step": 12108 + }, + { + "epoch": 1.540389263452487, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.952867031097412, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8769314885139465, + "num_tokens": 462046022.0, + "step": 12109 + }, + { + "epoch": 1.5405164737310775, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7261676788330078, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8680565357208252, + "num_tokens": 462089367.0, + "step": 12110 + }, + { + "epoch": 1.540643684009668, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7017110586166382, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8691779375076294, + "num_tokens": 462128558.0, + "step": 12111 + }, + { + "epoch": 1.5407708942882585, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9494320154190063, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8711352944374084, + "num_tokens": 462162639.0, + "step": 12112 + }, + { + "epoch": 1.5408981045668488, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9597549438476562, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8646750450134277, + "num_tokens": 462199303.0, + "step": 12113 + }, + { + "epoch": 1.5410253148454394, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7548149824142456, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8602839708328247, + "num_tokens": 462246095.0, + "step": 12114 + }, + { + "epoch": 1.54115252512403, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3294098377227783, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8599867224693298, + "num_tokens": 462286653.0, + "step": 12115 + }, + { + "epoch": 1.5412797354026204, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2537131309509277, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.860723614692688, + "num_tokens": 462328295.0, + "step": 12116 + }, + { + "epoch": 1.541406945681211, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.129042863845825, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.864833652973175, + "num_tokens": 462365175.0, + "step": 12117 + }, + { + "epoch": 1.5415341559598015, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8735278844833374, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8552548885345459, + "num_tokens": 462407044.0, + "step": 12118 + }, + { + "epoch": 1.541661366238392, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9225798845291138, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8569962978363037, + "num_tokens": 462443819.0, + "step": 12119 + }, + { + "epoch": 1.5417885765169825, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6965452432632446, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8732010126113892, + "num_tokens": 462487319.0, + "step": 12120 + }, + { + "epoch": 1.541915786795573, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8304206132888794, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8648568391799927, + "num_tokens": 462527026.0, + "step": 12121 + }, + { + "epoch": 1.5420429970741636, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.105053424835205, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8567611575126648, + "num_tokens": 462557552.0, + "step": 12122 + }, + { + "epoch": 1.5421702073527541, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.801314353942871, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.865451455116272, + "num_tokens": 462595243.0, + "step": 12123 + }, + { + "epoch": 1.5422974176313446, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8530309200286865, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8663049936294556, + "num_tokens": 462636077.0, + "step": 12124 + }, + { + "epoch": 1.5424246279099352, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8810786008834839, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8753644824028015, + "num_tokens": 462668407.0, + "step": 12125 + }, + { + "epoch": 1.5425518381885257, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9484672546386719, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8648344278335571, + "num_tokens": 462707678.0, + "step": 12126 + }, + { + "epoch": 1.5426790484671162, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.122619390487671, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8563094735145569, + "num_tokens": 462739722.0, + "step": 12127 + }, + { + "epoch": 1.5428062587457068, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9723386764526367, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8717721104621887, + "num_tokens": 462770369.0, + "step": 12128 + }, + { + "epoch": 1.5429334690242973, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.075941801071167, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8700297474861145, + "num_tokens": 462806126.0, + "step": 12129 + }, + { + "epoch": 1.5430606793028878, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7823688983917236, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8661562204360962, + "num_tokens": 462846585.0, + "step": 12130 + }, + { + "epoch": 1.5431878895814783, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.926807165145874, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.859494686126709, + "num_tokens": 462889892.0, + "step": 12131 + }, + { + "epoch": 1.5433150998600687, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.903450846672058, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8632526397705078, + "num_tokens": 462927612.0, + "step": 12132 + }, + { + "epoch": 1.5434423101386592, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7385586500167847, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8665156960487366, + "num_tokens": 462971661.0, + "step": 12133 + }, + { + "epoch": 1.5435695204172497, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8539446592330933, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8686522245407104, + "num_tokens": 463011260.0, + "step": 12134 + }, + { + "epoch": 1.5436967306958402, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9653186798095703, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.869563639163971, + "num_tokens": 463045395.0, + "step": 12135 + }, + { + "epoch": 1.5438239409744308, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8922183513641357, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8538311719894409, + "num_tokens": 463081015.0, + "step": 12136 + }, + { + "epoch": 1.5439511512530213, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1185014247894287, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8626385927200317, + "num_tokens": 463115658.0, + "step": 12137 + }, + { + "epoch": 1.5440783615316116, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.808517575263977, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8744207620620728, + "num_tokens": 463152084.0, + "step": 12138 + }, + { + "epoch": 1.5442055718102021, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8942837715148926, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8547428846359253, + "num_tokens": 463190137.0, + "step": 12139 + }, + { + "epoch": 1.5443327820887927, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7827578783035278, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8643300533294678, + "num_tokens": 463227626.0, + "step": 12140 + }, + { + "epoch": 1.5444599923673832, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8352516889572144, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.865086555480957, + "num_tokens": 463266451.0, + "step": 12141 + }, + { + "epoch": 1.5445872026459737, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3312697410583496, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8507620692253113, + "num_tokens": 463305327.0, + "step": 12142 + }, + { + "epoch": 1.5447144129245642, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.870985507965088, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8603267073631287, + "num_tokens": 463345890.0, + "step": 12143 + }, + { + "epoch": 1.5448416232031548, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.868558406829834, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8694841265678406, + "num_tokens": 463381583.0, + "step": 12144 + }, + { + "epoch": 1.5449688334817453, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9180753231048584, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8725500702857971, + "num_tokens": 463414442.0, + "step": 12145 + }, + { + "epoch": 1.5450960437603358, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9195722341537476, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8682005405426025, + "num_tokens": 463449517.0, + "step": 12146 + }, + { + "epoch": 1.5452232540389264, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.708589792251587, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.870536208152771, + "num_tokens": 463487952.0, + "step": 12147 + }, + { + "epoch": 1.5453504643175169, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.019355058670044, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8652232885360718, + "num_tokens": 463520724.0, + "step": 12148 + }, + { + "epoch": 1.5454776745961074, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9032825231552124, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8606318235397339, + "num_tokens": 463564691.0, + "step": 12149 + }, + { + "epoch": 1.545604884874698, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.05708384513855, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8598476052284241, + "num_tokens": 463599037.0, + "step": 12150 + }, + { + "epoch": 1.5457320951532885, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0512242317199707, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8490726947784424, + "num_tokens": 463630744.0, + "step": 12151 + }, + { + "epoch": 1.545859305431879, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.937235951423645, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.860952615737915, + "num_tokens": 463663551.0, + "step": 12152 + }, + { + "epoch": 1.5459865157104695, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7800941467285156, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.876289427280426, + "num_tokens": 463702446.0, + "step": 12153 + }, + { + "epoch": 1.54611372598906, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9537129402160645, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8542397618293762, + "num_tokens": 463739324.0, + "step": 12154 + }, + { + "epoch": 1.5462409362676506, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1404051780700684, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8446815609931946, + "num_tokens": 463780765.0, + "step": 12155 + }, + { + "epoch": 1.5463681465462409, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9762201309204102, + "learning_rate": 1e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8431127071380615, + "num_tokens": 463817727.0, + "step": 12156 + }, + { + "epoch": 1.5464953568248314, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7408615350723267, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8787319660186768, + "num_tokens": 463865793.0, + "step": 12157 + }, + { + "epoch": 1.546622567103422, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.794974446296692, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8649147748947144, + "num_tokens": 463908299.0, + "step": 12158 + }, + { + "epoch": 1.5467497773820125, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.812364101409912, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8601845502853394, + "num_tokens": 463944513.0, + "step": 12159 + }, + { + "epoch": 1.546876987660603, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8101742267608643, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8567067384719849, + "num_tokens": 463982862.0, + "step": 12160 + }, + { + "epoch": 1.5470041979391935, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6833523511886597, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8695319890975952, + "num_tokens": 464023357.0, + "step": 12161 + }, + { + "epoch": 1.5471314082177838, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7637470960617065, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8633751273155212, + "num_tokens": 464071889.0, + "step": 12162 + }, + { + "epoch": 1.5472586184963744, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8713984489440918, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8504776954650879, + "num_tokens": 464112334.0, + "step": 12163 + }, + { + "epoch": 1.5473858287749649, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9941684007644653, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8584669828414917, + "num_tokens": 464156833.0, + "step": 12164 + }, + { + "epoch": 1.5475130390535554, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9485561847686768, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8654208183288574, + "num_tokens": 464192989.0, + "step": 12165 + }, + { + "epoch": 1.547640249332146, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7575515508651733, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8683456182479858, + "num_tokens": 464232855.0, + "step": 12166 + }, + { + "epoch": 1.5477674596107365, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9506491422653198, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8472745418548584, + "num_tokens": 464271860.0, + "step": 12167 + }, + { + "epoch": 1.547894669889327, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.029233932495117, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8762854933738708, + "num_tokens": 464312925.0, + "step": 12168 + }, + { + "epoch": 1.5480218801679175, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9447282552719116, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8581609129905701, + "num_tokens": 464349190.0, + "step": 12169 + }, + { + "epoch": 1.548149090446508, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.714593768119812, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8583664298057556, + "num_tokens": 464392031.0, + "step": 12170 + }, + { + "epoch": 1.5482763007250986, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9453684091567993, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8596540689468384, + "num_tokens": 464424884.0, + "step": 12171 + }, + { + "epoch": 1.5484035110036891, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.140465259552002, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8582524061203003, + "num_tokens": 464466047.0, + "step": 12172 + }, + { + "epoch": 1.5485307212822796, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0713562965393066, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.862937331199646, + "num_tokens": 464497293.0, + "step": 12173 + }, + { + "epoch": 1.5486579315608702, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8679865598678589, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8481771945953369, + "num_tokens": 464541873.0, + "step": 12174 + }, + { + "epoch": 1.5487851418394607, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.980135440826416, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8637211322784424, + "num_tokens": 464579712.0, + "step": 12175 + }, + { + "epoch": 1.5489123521180512, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0298776626586914, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8612591624259949, + "num_tokens": 464612277.0, + "step": 12176 + }, + { + "epoch": 1.5490395623966418, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.942317247390747, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8601279854774475, + "num_tokens": 464647062.0, + "step": 12177 + }, + { + "epoch": 1.5491667726752323, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9009087085723877, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8661654591560364, + "num_tokens": 464677998.0, + "step": 12178 + }, + { + "epoch": 1.5492939829538228, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9302033185958862, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.855837345123291, + "num_tokens": 464719205.0, + "step": 12179 + }, + { + "epoch": 1.5494211932324133, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.673902153968811, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8682907819747925, + "num_tokens": 464760758.0, + "step": 12180 + }, + { + "epoch": 1.5495484035110036, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.4294698238372803, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8667149543762207, + "num_tokens": 464797422.0, + "step": 12181 + }, + { + "epoch": 1.5496756137895942, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9557318687438965, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8638631105422974, + "num_tokens": 464832889.0, + "step": 12182 + }, + { + "epoch": 1.5498028240681847, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0370802879333496, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8565599918365479, + "num_tokens": 464867743.0, + "step": 12183 + }, + { + "epoch": 1.5499300343467752, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8743585348129272, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8517945408821106, + "num_tokens": 464904666.0, + "step": 12184 + }, + { + "epoch": 1.5500572446253658, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8819966316223145, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8672583103179932, + "num_tokens": 464942886.0, + "step": 12185 + }, + { + "epoch": 1.5501844549039563, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0459365844726562, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8634418845176697, + "num_tokens": 464977463.0, + "step": 12186 + }, + { + "epoch": 1.5503116651825466, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9139113426208496, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8731673955917358, + "num_tokens": 465015732.0, + "step": 12187 + }, + { + "epoch": 1.5504388754611371, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8125993013381958, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8726194500923157, + "num_tokens": 465060789.0, + "step": 12188 + }, + { + "epoch": 1.5505660857397277, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.807600498199463, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8601388931274414, + "num_tokens": 465097907.0, + "step": 12189 + }, + { + "epoch": 1.5506932960183182, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7349324226379395, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8715354204177856, + "num_tokens": 465139401.0, + "step": 12190 + }, + { + "epoch": 1.5508205062969087, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9317184686660767, + "learning_rate": 1e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8424475193023682, + "num_tokens": 465176531.0, + "step": 12191 + }, + { + "epoch": 1.5509477165754992, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9298924207687378, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8735845685005188, + "num_tokens": 465216074.0, + "step": 12192 + }, + { + "epoch": 1.5510749268540898, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8036324977874756, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8764957189559937, + "num_tokens": 465257986.0, + "step": 12193 + }, + { + "epoch": 1.5512021371326803, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0523624420166016, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8667217493057251, + "num_tokens": 465296038.0, + "step": 12194 + }, + { + "epoch": 1.5513293474112708, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.874420166015625, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8397536873817444, + "num_tokens": 465334215.0, + "step": 12195 + }, + { + "epoch": 1.5514565576898613, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8626060485839844, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8619687557220459, + "num_tokens": 465373362.0, + "step": 12196 + }, + { + "epoch": 1.5515837679684519, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9903230667114258, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8577383756637573, + "num_tokens": 465410181.0, + "step": 12197 + }, + { + "epoch": 1.5517109782470424, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2149510383605957, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8711570501327515, + "num_tokens": 465443056.0, + "step": 12198 + }, + { + "epoch": 1.551838188525633, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8416098356246948, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8677588701248169, + "num_tokens": 465484509.0, + "step": 12199 + }, + { + "epoch": 1.5519653988042235, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8769690990447998, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8625326752662659, + "num_tokens": 465520017.0, + "step": 12200 + }, + { + "epoch": 1.552092609082814, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8445816040039062, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8796877264976501, + "num_tokens": 465557221.0, + "step": 12201 + }, + { + "epoch": 1.5522198193614045, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8033325672149658, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8720880746841431, + "num_tokens": 465595455.0, + "step": 12202 + }, + { + "epoch": 1.552347029639995, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8322515487670898, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8646171689033508, + "num_tokens": 465636436.0, + "step": 12203 + }, + { + "epoch": 1.5524742399185856, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9889538288116455, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8599229454994202, + "num_tokens": 465668956.0, + "step": 12204 + }, + { + "epoch": 1.5526014501971759, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9604620933532715, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8455988168716431, + "num_tokens": 465708812.0, + "step": 12205 + }, + { + "epoch": 1.5527286604757664, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2722716331481934, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8522517681121826, + "num_tokens": 465742610.0, + "step": 12206 + }, + { + "epoch": 1.552855870754357, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1078317165374756, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8617053031921387, + "num_tokens": 465776468.0, + "step": 12207 + }, + { + "epoch": 1.5529830810329475, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.210094928741455, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8792120218276978, + "num_tokens": 465809300.0, + "step": 12208 + }, + { + "epoch": 1.553110291311538, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8512349128723145, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8645733594894409, + "num_tokens": 465849847.0, + "step": 12209 + }, + { + "epoch": 1.5532375015901285, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.044025421142578, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8742942810058594, + "num_tokens": 465884986.0, + "step": 12210 + }, + { + "epoch": 1.5533647118687188, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.064767599105835, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8726343512535095, + "num_tokens": 465919710.0, + "step": 12211 + }, + { + "epoch": 1.5534919221473094, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0686769485473633, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8557156920433044, + "num_tokens": 465956041.0, + "step": 12212 + }, + { + "epoch": 1.5536191324258999, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.766296625137329, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8746829628944397, + "num_tokens": 465994352.0, + "step": 12213 + }, + { + "epoch": 1.5537463427044904, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9886741638183594, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8655793070793152, + "num_tokens": 466032493.0, + "step": 12214 + }, + { + "epoch": 1.553873552983081, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1414077281951904, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8644534349441528, + "num_tokens": 466067048.0, + "step": 12215 + }, + { + "epoch": 1.5540007632616715, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9783580303192139, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8405936360359192, + "num_tokens": 466106895.0, + "step": 12216 + }, + { + "epoch": 1.554127973540262, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.777897834777832, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8497397899627686, + "num_tokens": 466146740.0, + "step": 12217 + }, + { + "epoch": 1.5542551838188525, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8287839889526367, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8701872825622559, + "num_tokens": 466187097.0, + "step": 12218 + }, + { + "epoch": 1.554382394097443, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1809775829315186, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.853468656539917, + "num_tokens": 466223969.0, + "step": 12219 + }, + { + "epoch": 1.5545096043760336, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9821109771728516, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8746054768562317, + "num_tokens": 466260127.0, + "step": 12220 + }, + { + "epoch": 1.554636814654624, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8501293659210205, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8695357441902161, + "num_tokens": 466297219.0, + "step": 12221 + }, + { + "epoch": 1.5547640249332146, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9890638589859009, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8435032367706299, + "num_tokens": 466329474.0, + "step": 12222 + }, + { + "epoch": 1.5548912352118052, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0262961387634277, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8619748950004578, + "num_tokens": 466372801.0, + "step": 12223 + }, + { + "epoch": 1.5550184454903957, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8812350034713745, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.870194673538208, + "num_tokens": 466412950.0, + "step": 12224 + }, + { + "epoch": 1.5551456557689862, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9520143270492554, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8690205812454224, + "num_tokens": 466448830.0, + "step": 12225 + }, + { + "epoch": 1.5552728660475768, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7495986223220825, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8529917597770691, + "num_tokens": 466489749.0, + "step": 12226 + }, + { + "epoch": 1.5554000763261673, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8652760982513428, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8452745676040649, + "num_tokens": 466531320.0, + "step": 12227 + }, + { + "epoch": 1.5555272866047578, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.016187906265259, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8560552597045898, + "num_tokens": 466570001.0, + "step": 12228 + }, + { + "epoch": 1.5556544968833483, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9294160604476929, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8670583963394165, + "num_tokens": 466607186.0, + "step": 12229 + }, + { + "epoch": 1.5557817071619386, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9334534406661987, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8544540405273438, + "num_tokens": 466649480.0, + "step": 12230 + }, + { + "epoch": 1.5559089174405292, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8984012603759766, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8702830076217651, + "num_tokens": 466688861.0, + "step": 12231 + }, + { + "epoch": 1.5560361277191197, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0849626064300537, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8431479930877686, + "num_tokens": 466723200.0, + "step": 12232 + }, + { + "epoch": 1.5561633379977102, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.022862434387207, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8485552072525024, + "num_tokens": 466759639.0, + "step": 12233 + }, + { + "epoch": 1.5562905482763008, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.017505407333374, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8568891882896423, + "num_tokens": 466795748.0, + "step": 12234 + }, + { + "epoch": 1.5564177585548913, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.80649733543396, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8738962411880493, + "num_tokens": 466832938.0, + "step": 12235 + }, + { + "epoch": 1.5565449688334816, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9631896018981934, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8561158776283264, + "num_tokens": 466868710.0, + "step": 12236 + }, + { + "epoch": 1.5566721791120721, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8838564157485962, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8622482419013977, + "num_tokens": 466903558.0, + "step": 12237 + }, + { + "epoch": 1.5567993893906626, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.871097207069397, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8534058332443237, + "num_tokens": 466946785.0, + "step": 12238 + }, + { + "epoch": 1.5569265996692532, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9015676975250244, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.855305016040802, + "num_tokens": 466988563.0, + "step": 12239 + }, + { + "epoch": 1.5570538099478437, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8738746643066406, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8567919731140137, + "num_tokens": 467025447.0, + "step": 12240 + }, + { + "epoch": 1.5571810202264342, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.044438123703003, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8718194365501404, + "num_tokens": 467060751.0, + "step": 12241 + }, + { + "epoch": 1.5573082305050248, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.882158637046814, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8618552684783936, + "num_tokens": 467101889.0, + "step": 12242 + }, + { + "epoch": 1.5574354407836153, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.945265293121338, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8647956848144531, + "num_tokens": 467136782.0, + "step": 12243 + }, + { + "epoch": 1.5575626510622058, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8975844383239746, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8712108135223389, + "num_tokens": 467174137.0, + "step": 12244 + }, + { + "epoch": 1.5576898613407963, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8037090301513672, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8586345314979553, + "num_tokens": 467213919.0, + "step": 12245 + }, + { + "epoch": 1.5578170716193869, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8118774890899658, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8707207441329956, + "num_tokens": 467254860.0, + "step": 12246 + }, + { + "epoch": 1.5579442818979774, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.781285047531128, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.874218225479126, + "num_tokens": 467296806.0, + "step": 12247 + }, + { + "epoch": 1.558071492176568, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8986420631408691, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8690821528434753, + "num_tokens": 467332882.0, + "step": 12248 + }, + { + "epoch": 1.5581987024551585, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.821689486503601, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8784911632537842, + "num_tokens": 467375088.0, + "step": 12249 + }, + { + "epoch": 1.558325912733749, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1709187030792236, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8623335361480713, + "num_tokens": 467413000.0, + "step": 12250 + }, + { + "epoch": 1.5584531230123395, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 3.0349223613739014, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8782343864440918, + "num_tokens": 467450947.0, + "step": 12251 + }, + { + "epoch": 1.55858033329093, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7798500061035156, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8537445068359375, + "num_tokens": 467493770.0, + "step": 12252 + }, + { + "epoch": 1.5587075435695206, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9830272197723389, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8596630096435547, + "num_tokens": 467531882.0, + "step": 12253 + }, + { + "epoch": 1.5588347538481109, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9205020666122437, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8893396854400635, + "num_tokens": 467566059.0, + "step": 12254 + }, + { + "epoch": 1.5589619641267014, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7788423299789429, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8785120248794556, + "num_tokens": 467603734.0, + "step": 12255 + }, + { + "epoch": 1.559089174405292, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9403350353240967, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8717325329780579, + "num_tokens": 467641224.0, + "step": 12256 + }, + { + "epoch": 1.5592163846838825, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9645040035247803, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8614808320999146, + "num_tokens": 467677576.0, + "step": 12257 + }, + { + "epoch": 1.559343594962473, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8147674798965454, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8716298341751099, + "num_tokens": 467717526.0, + "step": 12258 + }, + { + "epoch": 1.5594708052410635, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8441916704177856, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.858260989189148, + "num_tokens": 467757591.0, + "step": 12259 + }, + { + "epoch": 1.5595980155196538, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9080097675323486, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8500748872756958, + "num_tokens": 467792794.0, + "step": 12260 + }, + { + "epoch": 1.5597252257982444, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 16.599651336669922, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8584825992584229, + "num_tokens": 467829554.0, + "step": 12261 + }, + { + "epoch": 1.5598524360768349, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9971039295196533, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8481969833374023, + "num_tokens": 467871403.0, + "step": 12262 + }, + { + "epoch": 1.5599796463554254, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2134346961975098, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8430043458938599, + "num_tokens": 467906498.0, + "step": 12263 + }, + { + "epoch": 1.560106856634016, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7833133935928345, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8619347810745239, + "num_tokens": 467946426.0, + "step": 12264 + }, + { + "epoch": 1.5602340669126065, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.369860887527466, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8656793832778931, + "num_tokens": 467979930.0, + "step": 12265 + }, + { + "epoch": 1.560361277191197, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.984628677368164, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8576368093490601, + "num_tokens": 468016347.0, + "step": 12266 + }, + { + "epoch": 1.5604884874697875, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.8180220127105713, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8699557781219482, + "num_tokens": 468051446.0, + "step": 12267 + }, + { + "epoch": 1.560615697748378, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.939558506011963, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8793362379074097, + "num_tokens": 468085713.0, + "step": 12268 + }, + { + "epoch": 1.5607429080269686, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9596165418624878, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8760280609130859, + "num_tokens": 468117863.0, + "step": 12269 + }, + { + "epoch": 1.560870118305559, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9000484943389893, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8596283793449402, + "num_tokens": 468153041.0, + "step": 12270 + }, + { + "epoch": 1.5609973285841496, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8952977657318115, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8609957098960876, + "num_tokens": 468193624.0, + "step": 12271 + }, + { + "epoch": 1.5611245388627402, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8669862747192383, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8788328170776367, + "num_tokens": 468227091.0, + "step": 12272 + }, + { + "epoch": 1.5612517491413307, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7687543630599976, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8589683771133423, + "num_tokens": 468265079.0, + "step": 12273 + }, + { + "epoch": 1.5613789594199212, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8238922357559204, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.860740065574646, + "num_tokens": 468307055.0, + "step": 12274 + }, + { + "epoch": 1.5615061696985117, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9360665082931519, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8575531244277954, + "num_tokens": 468344429.0, + "step": 12275 + }, + { + "epoch": 1.5616333799771023, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7986887693405151, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.862684965133667, + "num_tokens": 468379448.0, + "step": 12276 + }, + { + "epoch": 1.5617605902556928, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.010866641998291, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8660117387771606, + "num_tokens": 468423344.0, + "step": 12277 + }, + { + "epoch": 1.5618878005342833, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8111058473587036, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8692730665206909, + "num_tokens": 468461657.0, + "step": 12278 + }, + { + "epoch": 1.5620150108128736, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.7759556770324707, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.878259539604187, + "num_tokens": 468500428.0, + "step": 12279 + }, + { + "epoch": 1.5621422210914642, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0431480407714844, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8508058786392212, + "num_tokens": 468536310.0, + "step": 12280 + }, + { + "epoch": 1.5622694313700547, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9144597053527832, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8614872694015503, + "num_tokens": 468577954.0, + "step": 12281 + }, + { + "epoch": 1.5623966416486452, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9876737594604492, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8549903631210327, + "num_tokens": 468612131.0, + "step": 12282 + }, + { + "epoch": 1.5625238519272358, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8245892524719238, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8587543964385986, + "num_tokens": 468654022.0, + "step": 12283 + }, + { + "epoch": 1.5626510622058263, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9865599870681763, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8534140586853027, + "num_tokens": 468693120.0, + "step": 12284 + }, + { + "epoch": 1.5627782724844166, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9440093040466309, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8633747100830078, + "num_tokens": 468727695.0, + "step": 12285 + }, + { + "epoch": 1.5629054827630071, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9172358512878418, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8588985204696655, + "num_tokens": 468766705.0, + "step": 12286 + }, + { + "epoch": 1.5630326930415976, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8945825099945068, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8547601699829102, + "num_tokens": 468804555.0, + "step": 12287 + }, + { + "epoch": 1.5631599033201882, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.503089427947998, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8591281771659851, + "num_tokens": 468842495.0, + "step": 12288 + }, + { + "epoch": 1.5632871135987787, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.844759225845337, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8613854646682739, + "num_tokens": 468877574.0, + "step": 12289 + }, + { + "epoch": 1.5634143238773692, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8222382068634033, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8741239309310913, + "num_tokens": 468914573.0, + "step": 12290 + }, + { + "epoch": 1.5635415341559598, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8930881023406982, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8734923005104065, + "num_tokens": 468950943.0, + "step": 12291 + }, + { + "epoch": 1.5636687444345503, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9438072443008423, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8785918951034546, + "num_tokens": 468990007.0, + "step": 12292 + }, + { + "epoch": 1.5637959547131408, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9055612087249756, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8651039600372314, + "num_tokens": 469022461.0, + "step": 12293 + }, + { + "epoch": 1.5639231649917313, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9394407272338867, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.86924809217453, + "num_tokens": 469056396.0, + "step": 12294 + }, + { + "epoch": 1.5640503752703219, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7243462800979614, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8728218674659729, + "num_tokens": 469099464.0, + "step": 12295 + }, + { + "epoch": 1.5641775855489124, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.731248140335083, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.871295154094696, + "num_tokens": 469139830.0, + "step": 12296 + }, + { + "epoch": 1.564304795827503, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.780810832977295, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8624780774116516, + "num_tokens": 469177838.0, + "step": 12297 + }, + { + "epoch": 1.5644320061060935, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7073428630828857, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8888297080993652, + "num_tokens": 469218727.0, + "step": 12298 + }, + { + "epoch": 1.564559216384684, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.937973976135254, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8414906859397888, + "num_tokens": 469261186.0, + "step": 12299 + }, + { + "epoch": 1.5646864266632745, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8043087720870972, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.871189296245575, + "num_tokens": 469301191.0, + "step": 12300 + }, + { + "epoch": 1.564813636941865, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8525521755218506, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8692243695259094, + "num_tokens": 469340132.0, + "step": 12301 + }, + { + "epoch": 1.5649408472204556, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.096806764602661, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8832027316093445, + "num_tokens": 469376211.0, + "step": 12302 + }, + { + "epoch": 1.5650680574990459, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9010934829711914, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.853116512298584, + "num_tokens": 469413961.0, + "step": 12303 + }, + { + "epoch": 1.5651952677776364, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0457799434661865, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8626853823661804, + "num_tokens": 469449507.0, + "step": 12304 + }, + { + "epoch": 1.565322478056227, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.509932518005371, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8488038778305054, + "num_tokens": 469486324.0, + "step": 12305 + }, + { + "epoch": 1.5654496883348175, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8922728300094604, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8583039045333862, + "num_tokens": 469523799.0, + "step": 12306 + }, + { + "epoch": 1.565576898613408, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.952303171157837, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8488789796829224, + "num_tokens": 469558661.0, + "step": 12307 + }, + { + "epoch": 1.5657041088919985, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8180325031280518, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8616521954536438, + "num_tokens": 469603087.0, + "step": 12308 + }, + { + "epoch": 1.5658313191705888, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1434519290924072, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8632299900054932, + "num_tokens": 469637344.0, + "step": 12309 + }, + { + "epoch": 1.5659585294491793, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8650916814804077, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.87200927734375, + "num_tokens": 469681127.0, + "step": 12310 + }, + { + "epoch": 1.5660857397277699, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8039963245391846, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8615184426307678, + "num_tokens": 469724238.0, + "step": 12311 + }, + { + "epoch": 1.5662129500063604, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8265466690063477, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8564873933792114, + "num_tokens": 469765967.0, + "step": 12312 + }, + { + "epoch": 1.566340160284951, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.171522617340088, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8646458983421326, + "num_tokens": 469801387.0, + "step": 12313 + }, + { + "epoch": 1.5664673705635415, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.064038038253784, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8638536334037781, + "num_tokens": 469835448.0, + "step": 12314 + }, + { + "epoch": 1.566594580842132, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.904855728149414, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8542866110801697, + "num_tokens": 469876647.0, + "step": 12315 + }, + { + "epoch": 1.5667217911207225, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7511296272277832, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8774111270904541, + "num_tokens": 469918158.0, + "step": 12316 + }, + { + "epoch": 1.566849001399313, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0033645629882812, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8487771153450012, + "num_tokens": 469953691.0, + "step": 12317 + }, + { + "epoch": 1.5669762116779036, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9443942308425903, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8665568232536316, + "num_tokens": 469996174.0, + "step": 12318 + }, + { + "epoch": 1.567103421956494, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.072249412536621, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8498389720916748, + "num_tokens": 470031371.0, + "step": 12319 + }, + { + "epoch": 1.5672306322350846, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.989595651626587, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8658914566040039, + "num_tokens": 470068783.0, + "step": 12320 + }, + { + "epoch": 1.5673578425136752, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9970459938049316, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.846381664276123, + "num_tokens": 470103490.0, + "step": 12321 + }, + { + "epoch": 1.5674850527922657, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0519931316375732, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.859432578086853, + "num_tokens": 470137213.0, + "step": 12322 + }, + { + "epoch": 1.5676122630708562, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0436975955963135, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8671869039535522, + "num_tokens": 470171816.0, + "step": 12323 + }, + { + "epoch": 1.5677394733494467, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8812615871429443, + "learning_rate": 1e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8397507667541504, + "num_tokens": 470210517.0, + "step": 12324 + }, + { + "epoch": 1.5678666836280373, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9899297952651978, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8515933156013489, + "num_tokens": 470247085.0, + "step": 12325 + }, + { + "epoch": 1.5679938939066278, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9094514846801758, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8628377914428711, + "num_tokens": 470288483.0, + "step": 12326 + }, + { + "epoch": 1.5681211041852183, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9745110273361206, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8640108108520508, + "num_tokens": 470322173.0, + "step": 12327 + }, + { + "epoch": 1.5682483144638086, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9880051612854004, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8637809753417969, + "num_tokens": 470356617.0, + "step": 12328 + }, + { + "epoch": 1.5683755247423992, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9767694473266602, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8732008934020996, + "num_tokens": 470393095.0, + "step": 12329 + }, + { + "epoch": 1.5685027350209897, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.4867920875549316, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8739291429519653, + "num_tokens": 470427785.0, + "step": 12330 + }, + { + "epoch": 1.5686299452995802, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8482123613357544, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8581142425537109, + "num_tokens": 470469971.0, + "step": 12331 + }, + { + "epoch": 1.5687571555781707, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0080528259277344, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8535259962081909, + "num_tokens": 470503935.0, + "step": 12332 + }, + { + "epoch": 1.5688843658567613, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9295703172683716, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8649343848228455, + "num_tokens": 470536014.0, + "step": 12333 + }, + { + "epoch": 1.5690115761353516, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.84123694896698, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8768746852874756, + "num_tokens": 470575632.0, + "step": 12334 + }, + { + "epoch": 1.569138786413942, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7595545053482056, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8748733997344971, + "num_tokens": 470611295.0, + "step": 12335 + }, + { + "epoch": 1.5692659966925326, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.910146713256836, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8618017435073853, + "num_tokens": 470646993.0, + "step": 12336 + }, + { + "epoch": 1.5693932069711232, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1424012184143066, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8665916919708252, + "num_tokens": 470680253.0, + "step": 12337 + }, + { + "epoch": 1.5695204172497137, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9591208696365356, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8618987798690796, + "num_tokens": 470716169.0, + "step": 12338 + }, + { + "epoch": 1.5696476275283042, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8598294258117676, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8769277334213257, + "num_tokens": 470754208.0, + "step": 12339 + }, + { + "epoch": 1.5697748378068948, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.867383599281311, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.865075409412384, + "num_tokens": 470801329.0, + "step": 12340 + }, + { + "epoch": 1.5699020480854853, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0201926231384277, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8585845828056335, + "num_tokens": 470838408.0, + "step": 12341 + }, + { + "epoch": 1.5700292583640758, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8787649869918823, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8633599281311035, + "num_tokens": 470874001.0, + "step": 12342 + }, + { + "epoch": 1.5701564686426663, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7523711919784546, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8590282201766968, + "num_tokens": 470916870.0, + "step": 12343 + }, + { + "epoch": 1.5702836789212569, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9616328477859497, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8708152770996094, + "num_tokens": 470953862.0, + "step": 12344 + }, + { + "epoch": 1.5704108891998474, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.893547534942627, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8744797110557556, + "num_tokens": 470991507.0, + "step": 12345 + }, + { + "epoch": 1.570538099478438, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8999074697494507, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8664048910140991, + "num_tokens": 471030109.0, + "step": 12346 + }, + { + "epoch": 1.5706653097570284, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9797801971435547, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.87746262550354, + "num_tokens": 471063829.0, + "step": 12347 + }, + { + "epoch": 1.570792520035619, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9254337549209595, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8530920743942261, + "num_tokens": 471107259.0, + "step": 12348 + }, + { + "epoch": 1.5709197303142095, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9750621318817139, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8664395213127136, + "num_tokens": 471141516.0, + "step": 12349 + }, + { + "epoch": 1.5710469405928, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9389679431915283, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8641198873519897, + "num_tokens": 471178655.0, + "step": 12350 + }, + { + "epoch": 1.5711741508713906, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 80.52413940429688, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.875167965888977, + "num_tokens": 471209437.0, + "step": 12351 + }, + { + "epoch": 1.5713013611499809, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9851731061935425, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.878913938999176, + "num_tokens": 471249559.0, + "step": 12352 + }, + { + "epoch": 1.5714285714285714, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0858383178710938, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8679084777832031, + "num_tokens": 471287179.0, + "step": 12353 + }, + { + "epoch": 1.571555781707162, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9122394323349, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8669030666351318, + "num_tokens": 471328052.0, + "step": 12354 + }, + { + "epoch": 1.5716829919857525, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2090001106262207, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8615143895149231, + "num_tokens": 471362414.0, + "step": 12355 + }, + { + "epoch": 1.571810202264343, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.003037452697754, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.860170304775238, + "num_tokens": 471397910.0, + "step": 12356 + }, + { + "epoch": 1.5719374125429335, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8048512935638428, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8784903883934021, + "num_tokens": 471436951.0, + "step": 12357 + }, + { + "epoch": 1.5720646228215238, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9230624437332153, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.867571234703064, + "num_tokens": 471470139.0, + "step": 12358 + }, + { + "epoch": 1.5721918331001143, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9974815845489502, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8716709017753601, + "num_tokens": 471508591.0, + "step": 12359 + }, + { + "epoch": 1.5723190433787049, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.7866578102111816, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8639439344406128, + "num_tokens": 471549165.0, + "step": 12360 + }, + { + "epoch": 1.5724462536572954, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9580118656158447, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8571781516075134, + "num_tokens": 471589021.0, + "step": 12361 + }, + { + "epoch": 1.572573463935886, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 20.595977783203125, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8581559658050537, + "num_tokens": 471624263.0, + "step": 12362 + }, + { + "epoch": 1.5727006742144765, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0732715129852295, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8582712411880493, + "num_tokens": 471663678.0, + "step": 12363 + }, + { + "epoch": 1.572827884493067, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9756672382354736, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8628990650177002, + "num_tokens": 471705887.0, + "step": 12364 + }, + { + "epoch": 1.5729550947716575, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8620939254760742, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8713306188583374, + "num_tokens": 471739796.0, + "step": 12365 + }, + { + "epoch": 1.573082305050248, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2572052478790283, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8609206676483154, + "num_tokens": 471774723.0, + "step": 12366 + }, + { + "epoch": 1.5732095153288386, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 7.767268180847168, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8570588827133179, + "num_tokens": 471814169.0, + "step": 12367 + }, + { + "epoch": 1.573336725607429, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0561203956604004, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8656284809112549, + "num_tokens": 471852367.0, + "step": 12368 + }, + { + "epoch": 1.5734639358860196, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9634695053100586, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8699548244476318, + "num_tokens": 471883438.0, + "step": 12369 + }, + { + "epoch": 1.5735911461646102, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7901984453201294, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8617952466011047, + "num_tokens": 471921051.0, + "step": 12370 + }, + { + "epoch": 1.5737183564432007, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8275479078292847, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8673352003097534, + "num_tokens": 471959685.0, + "step": 12371 + }, + { + "epoch": 1.5738455667217912, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8096128702163696, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8636711835861206, + "num_tokens": 471997345.0, + "step": 12372 + }, + { + "epoch": 1.5739727770003817, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.787502646446228, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.869732141494751, + "num_tokens": 472034455.0, + "step": 12373 + }, + { + "epoch": 1.5740999872789723, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7840750217437744, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8631848692893982, + "num_tokens": 472073630.0, + "step": 12374 + }, + { + "epoch": 1.5742271975575628, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8318511247634888, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8618999123573303, + "num_tokens": 472111556.0, + "step": 12375 + }, + { + "epoch": 1.5743544078361533, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8377488851547241, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8714652061462402, + "num_tokens": 472148276.0, + "step": 12376 + }, + { + "epoch": 1.5744816181147436, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.788025140762329, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8637065291404724, + "num_tokens": 472181665.0, + "step": 12377 + }, + { + "epoch": 1.5746088283933342, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0427510738372803, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8576258420944214, + "num_tokens": 472222759.0, + "step": 12378 + }, + { + "epoch": 1.5747360386719247, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 7.7746500968933105, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8587159514427185, + "num_tokens": 472260867.0, + "step": 12379 + }, + { + "epoch": 1.5748632489505152, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.036322593688965, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8522858619689941, + "num_tokens": 472300068.0, + "step": 12380 + }, + { + "epoch": 1.5749904592291057, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.940405249595642, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8748587369918823, + "num_tokens": 472336459.0, + "step": 12381 + }, + { + "epoch": 1.5751176695076963, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7913331985473633, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8689023852348328, + "num_tokens": 472376696.0, + "step": 12382 + }, + { + "epoch": 1.5752448797862866, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.796895146369934, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8775995373725891, + "num_tokens": 472414491.0, + "step": 12383 + }, + { + "epoch": 1.575372090064877, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.880253791809082, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8635380268096924, + "num_tokens": 472452188.0, + "step": 12384 + }, + { + "epoch": 1.5754993003434676, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1943893432617188, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8635108470916748, + "num_tokens": 472489690.0, + "step": 12385 + }, + { + "epoch": 1.5756265106220582, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7980378866195679, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8631041049957275, + "num_tokens": 472529912.0, + "step": 12386 + }, + { + "epoch": 1.5757537209006487, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8976927995681763, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8626856803894043, + "num_tokens": 472562534.0, + "step": 12387 + }, + { + "epoch": 1.5758809311792392, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.748138904571533, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8632645606994629, + "num_tokens": 472598955.0, + "step": 12388 + }, + { + "epoch": 1.5760081414578297, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7597546577453613, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8681934475898743, + "num_tokens": 472638064.0, + "step": 12389 + }, + { + "epoch": 1.5761353517364203, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8584294319152832, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8675389885902405, + "num_tokens": 472676823.0, + "step": 12390 + }, + { + "epoch": 1.5762625620150108, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7543957233428955, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8637734651565552, + "num_tokens": 472718528.0, + "step": 12391 + }, + { + "epoch": 1.5763897722936013, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8738123178482056, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8701801300048828, + "num_tokens": 472755634.0, + "step": 12392 + }, + { + "epoch": 1.5765169825721919, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9715603590011597, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8769885301589966, + "num_tokens": 472798060.0, + "step": 12393 + }, + { + "epoch": 1.5766441928507824, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.829832673072815, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8697092533111572, + "num_tokens": 472837628.0, + "step": 12394 + }, + { + "epoch": 1.576771403129373, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8853745460510254, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.865454912185669, + "num_tokens": 472877446.0, + "step": 12395 + }, + { + "epoch": 1.5768986134079634, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8984858989715576, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8592175245285034, + "num_tokens": 472918437.0, + "step": 12396 + }, + { + "epoch": 1.577025823686554, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7877814769744873, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8731443285942078, + "num_tokens": 472956242.0, + "step": 12397 + }, + { + "epoch": 1.5771530339651445, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.000157356262207, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.874076247215271, + "num_tokens": 472985620.0, + "step": 12398 + }, + { + "epoch": 1.577280244243735, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.087311029434204, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8689463138580322, + "num_tokens": 473019517.0, + "step": 12399 + }, + { + "epoch": 1.5774074545223256, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.040198802947998, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8838306665420532, + "num_tokens": 473054624.0, + "step": 12400 + }, + { + "epoch": 1.5775346648009159, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.9435174465179443, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8555994033813477, + "num_tokens": 473094140.0, + "step": 12401 + }, + { + "epoch": 1.5776618750795064, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.954206109046936, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8473173379898071, + "num_tokens": 473136886.0, + "step": 12402 + }, + { + "epoch": 1.577789085358097, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9871761798858643, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8516983985900879, + "num_tokens": 473173267.0, + "step": 12403 + }, + { + "epoch": 1.5779162956366874, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8383451700210571, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8435114622116089, + "num_tokens": 473216477.0, + "step": 12404 + }, + { + "epoch": 1.578043505915278, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9323811531066895, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.856436014175415, + "num_tokens": 473256986.0, + "step": 12405 + }, + { + "epoch": 1.5781707161938685, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8641984462738037, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8716768622398376, + "num_tokens": 473297967.0, + "step": 12406 + }, + { + "epoch": 1.5782979264724588, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 3.2801530361175537, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8592133522033691, + "num_tokens": 473333580.0, + "step": 12407 + }, + { + "epoch": 1.5784251367510493, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.537644386291504, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8692001700401306, + "num_tokens": 473370553.0, + "step": 12408 + }, + { + "epoch": 1.5785523470296399, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0282857418060303, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8465568423271179, + "num_tokens": 473410705.0, + "step": 12409 + }, + { + "epoch": 1.5786795573082304, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0299854278564453, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8681625127792358, + "num_tokens": 473443315.0, + "step": 12410 + }, + { + "epoch": 1.578806767586821, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.875287652015686, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8564308881759644, + "num_tokens": 473488767.0, + "step": 12411 + }, + { + "epoch": 1.5789339778654115, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.949264645576477, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8796395063400269, + "num_tokens": 473525360.0, + "step": 12412 + }, + { + "epoch": 1.579061188144002, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.011772394180298, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8548907041549683, + "num_tokens": 473560625.0, + "step": 12413 + }, + { + "epoch": 1.5791883984225925, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8589469194412231, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8826284408569336, + "num_tokens": 473595484.0, + "step": 12414 + }, + { + "epoch": 1.579315608701183, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0660881996154785, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8621202111244202, + "num_tokens": 473627269.0, + "step": 12415 + }, + { + "epoch": 1.5794428189797736, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8224860429763794, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8488219380378723, + "num_tokens": 473669159.0, + "step": 12416 + }, + { + "epoch": 1.579570029258364, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7785617113113403, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8741708993911743, + "num_tokens": 473707739.0, + "step": 12417 + }, + { + "epoch": 1.5796972395369546, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8603533506393433, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8648343682289124, + "num_tokens": 473746908.0, + "step": 12418 + }, + { + "epoch": 1.5798244498155452, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.939404845237732, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8631152510643005, + "num_tokens": 473781480.0, + "step": 12419 + }, + { + "epoch": 1.5799516600941357, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9411582946777344, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8639379143714905, + "num_tokens": 473814676.0, + "step": 12420 + }, + { + "epoch": 1.5800788703727262, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.911219835281372, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8595532178878784, + "num_tokens": 473855487.0, + "step": 12421 + }, + { + "epoch": 1.5802060806513167, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.02616548538208, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8765310049057007, + "num_tokens": 473885648.0, + "step": 12422 + }, + { + "epoch": 1.5803332909299073, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9954397678375244, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.860063910484314, + "num_tokens": 473923862.0, + "step": 12423 + }, + { + "epoch": 1.5804605012084978, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.998249888420105, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8672648668289185, + "num_tokens": 473957725.0, + "step": 12424 + }, + { + "epoch": 1.5805877114870883, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8540959358215332, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8642667531967163, + "num_tokens": 473994939.0, + "step": 12425 + }, + { + "epoch": 1.5807149217656786, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8825619220733643, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8600208759307861, + "num_tokens": 474035996.0, + "step": 12426 + }, + { + "epoch": 1.5808421320442692, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.862923502922058, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8720041513442993, + "num_tokens": 474071693.0, + "step": 12427 + }, + { + "epoch": 1.5809693423228597, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9428105354309082, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8708668351173401, + "num_tokens": 474114415.0, + "step": 12428 + }, + { + "epoch": 1.5810965526014502, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0538828372955322, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8588658571243286, + "num_tokens": 474152060.0, + "step": 12429 + }, + { + "epoch": 1.5812237628800407, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8769850730895996, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8664511442184448, + "num_tokens": 474190605.0, + "step": 12430 + }, + { + "epoch": 1.5813509731586313, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.025442600250244, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8601503968238831, + "num_tokens": 474229447.0, + "step": 12431 + }, + { + "epoch": 1.5814781834372216, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8460968732833862, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8724182844161987, + "num_tokens": 474268088.0, + "step": 12432 + }, + { + "epoch": 1.581605393715812, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0286526679992676, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8599359393119812, + "num_tokens": 474304856.0, + "step": 12433 + }, + { + "epoch": 1.5817326039944026, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7602871656417847, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8680627346038818, + "num_tokens": 474349248.0, + "step": 12434 + }, + { + "epoch": 1.5818598142729932, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8037869930267334, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8753838539123535, + "num_tokens": 474384432.0, + "step": 12435 + }, + { + "epoch": 1.5819870245515837, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9718421697616577, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8618006110191345, + "num_tokens": 474415881.0, + "step": 12436 + }, + { + "epoch": 1.5821142348301742, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8839439153671265, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8640528917312622, + "num_tokens": 474452755.0, + "step": 12437 + }, + { + "epoch": 1.5822414451087647, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8339216709136963, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8599145412445068, + "num_tokens": 474493937.0, + "step": 12438 + }, + { + "epoch": 1.5823686553873553, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7962671518325806, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8705047369003296, + "num_tokens": 474534072.0, + "step": 12439 + }, + { + "epoch": 1.5824958656659458, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.956327199935913, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8563404679298401, + "num_tokens": 474569343.0, + "step": 12440 + }, + { + "epoch": 1.5826230759445363, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8456166982650757, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8544024229049683, + "num_tokens": 474606610.0, + "step": 12441 + }, + { + "epoch": 1.5827502862231269, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9014015197753906, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8547931909561157, + "num_tokens": 474645071.0, + "step": 12442 + }, + { + "epoch": 1.5828774965017174, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7443040609359741, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8533209562301636, + "num_tokens": 474683985.0, + "step": 12443 + }, + { + "epoch": 1.583004706780308, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9695848226547241, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8693304061889648, + "num_tokens": 474726102.0, + "step": 12444 + }, + { + "epoch": 1.5831319170588984, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.818777322769165, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8673077821731567, + "num_tokens": 474766430.0, + "step": 12445 + }, + { + "epoch": 1.583259127337489, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0751426219940186, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8635867238044739, + "num_tokens": 474800648.0, + "step": 12446 + }, + { + "epoch": 1.5833863376160795, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7192872762680054, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8683780431747437, + "num_tokens": 474845540.0, + "step": 12447 + }, + { + "epoch": 1.58351354789467, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9175735712051392, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8618230819702148, + "num_tokens": 474884296.0, + "step": 12448 + }, + { + "epoch": 1.5836407581732606, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8064123392105103, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8691755533218384, + "num_tokens": 474929344.0, + "step": 12449 + }, + { + "epoch": 1.5837679684518509, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.048792839050293, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8697315454483032, + "num_tokens": 474968035.0, + "step": 12450 + }, + { + "epoch": 1.5838951787304414, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9118824005126953, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.858826220035553, + "num_tokens": 475006189.0, + "step": 12451 + }, + { + "epoch": 1.584022389009032, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8534690141677856, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8592283725738525, + "num_tokens": 475039768.0, + "step": 12452 + }, + { + "epoch": 1.5841495992876224, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9282069206237793, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8697550892829895, + "num_tokens": 475079938.0, + "step": 12453 + }, + { + "epoch": 1.584276809566213, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7798800468444824, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8508078455924988, + "num_tokens": 475122405.0, + "step": 12454 + }, + { + "epoch": 1.5844040198448035, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8680720329284668, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8573965430259705, + "num_tokens": 475158237.0, + "step": 12455 + }, + { + "epoch": 1.5845312301233938, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8532936573028564, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8654563426971436, + "num_tokens": 475194966.0, + "step": 12456 + }, + { + "epoch": 1.5846584404019843, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.870270013809204, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8628641366958618, + "num_tokens": 475234483.0, + "step": 12457 + }, + { + "epoch": 1.5847856506805749, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8080964088439941, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.859187662601471, + "num_tokens": 475275172.0, + "step": 12458 + }, + { + "epoch": 1.5849128609591654, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8111333847045898, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.867605447769165, + "num_tokens": 475313550.0, + "step": 12459 + }, + { + "epoch": 1.585040071237756, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.837409257888794, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8720329403877258, + "num_tokens": 475357019.0, + "step": 12460 + }, + { + "epoch": 1.5851672815163464, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0742034912109375, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8682687878608704, + "num_tokens": 475394788.0, + "step": 12461 + }, + { + "epoch": 1.585294491794937, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9814889430999756, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8622901439666748, + "num_tokens": 475432354.0, + "step": 12462 + }, + { + "epoch": 1.5854217020735275, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9250446557998657, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8679298162460327, + "num_tokens": 475467284.0, + "step": 12463 + }, + { + "epoch": 1.585548912352118, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.89592707157135, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8556303381919861, + "num_tokens": 475505946.0, + "step": 12464 + }, + { + "epoch": 1.5856761226307086, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9723154306411743, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.851264476776123, + "num_tokens": 475542799.0, + "step": 12465 + }, + { + "epoch": 1.585803332909299, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9463545083999634, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8648369312286377, + "num_tokens": 475582064.0, + "step": 12466 + }, + { + "epoch": 1.5859305431878896, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9447287321090698, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8601891994476318, + "num_tokens": 475615521.0, + "step": 12467 + }, + { + "epoch": 1.5860577534664801, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9072760343551636, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8722124099731445, + "num_tokens": 475650500.0, + "step": 12468 + }, + { + "epoch": 1.5861849637450707, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9021565914154053, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8653706908226013, + "num_tokens": 475695824.0, + "step": 12469 + }, + { + "epoch": 1.5863121740236612, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.889746904373169, + "learning_rate": 1e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8421388864517212, + "num_tokens": 475734015.0, + "step": 12470 + }, + { + "epoch": 1.5864393843022517, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.818418025970459, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8617318868637085, + "num_tokens": 475768961.0, + "step": 12471 + }, + { + "epoch": 1.5865665945808423, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9190555810928345, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8797361850738525, + "num_tokens": 475806551.0, + "step": 12472 + }, + { + "epoch": 1.5866938048594328, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9179892539978027, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8583946228027344, + "num_tokens": 475844066.0, + "step": 12473 + }, + { + "epoch": 1.5868210151380233, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9921892881393433, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8684688806533813, + "num_tokens": 475874554.0, + "step": 12474 + }, + { + "epoch": 1.5869482254166136, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0630249977111816, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8707808256149292, + "num_tokens": 475915840.0, + "step": 12475 + }, + { + "epoch": 1.5870754356952042, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8854907751083374, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8630880117416382, + "num_tokens": 475951516.0, + "step": 12476 + }, + { + "epoch": 1.5872026459737947, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.01784086227417, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8671817779541016, + "num_tokens": 475984224.0, + "step": 12477 + }, + { + "epoch": 1.5873298562523852, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0554921627044678, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8428570032119751, + "num_tokens": 476015758.0, + "step": 12478 + }, + { + "epoch": 1.5874570665309757, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0721402168273926, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8606545329093933, + "num_tokens": 476051835.0, + "step": 12479 + }, + { + "epoch": 1.5875842768095663, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9474376440048218, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8603006601333618, + "num_tokens": 476090667.0, + "step": 12480 + }, + { + "epoch": 1.5877114870881566, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9155077934265137, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8618052005767822, + "num_tokens": 476130448.0, + "step": 12481 + }, + { + "epoch": 1.587838697366747, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.802650809288025, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.854081392288208, + "num_tokens": 476174495.0, + "step": 12482 + }, + { + "epoch": 1.5879659076453376, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9531010389328003, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8725023865699768, + "num_tokens": 476213412.0, + "step": 12483 + }, + { + "epoch": 1.5880931179239282, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9368029832839966, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8772552013397217, + "num_tokens": 476250441.0, + "step": 12484 + }, + { + "epoch": 1.5882203282025187, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0603182315826416, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.877632737159729, + "num_tokens": 476284466.0, + "step": 12485 + }, + { + "epoch": 1.5883475384811092, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.837759256362915, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8627808094024658, + "num_tokens": 476327760.0, + "step": 12486 + }, + { + "epoch": 1.5884747487596997, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.771714448928833, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8597657084465027, + "num_tokens": 476369259.0, + "step": 12487 + }, + { + "epoch": 1.5886019590382903, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.886479377746582, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8656699657440186, + "num_tokens": 476403915.0, + "step": 12488 + }, + { + "epoch": 1.5887291693168808, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9105726480484009, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8614950180053711, + "num_tokens": 476440837.0, + "step": 12489 + }, + { + "epoch": 1.5888563795954713, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8459933996200562, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8622062802314758, + "num_tokens": 476478478.0, + "step": 12490 + }, + { + "epoch": 1.5889835898740619, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8612196445465088, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8731138110160828, + "num_tokens": 476514675.0, + "step": 12491 + }, + { + "epoch": 1.5891108001526524, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9462794065475464, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.860335648059845, + "num_tokens": 476559015.0, + "step": 12492 + }, + { + "epoch": 1.589238010431243, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9179019927978516, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.848639965057373, + "num_tokens": 476601848.0, + "step": 12493 + }, + { + "epoch": 1.5893652207098334, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8700571060180664, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8722405433654785, + "num_tokens": 476636794.0, + "step": 12494 + }, + { + "epoch": 1.589492430988424, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.018765687942505, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8579778075218201, + "num_tokens": 476671593.0, + "step": 12495 + }, + { + "epoch": 1.5896196412670145, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8203034400939941, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.874151349067688, + "num_tokens": 476706776.0, + "step": 12496 + }, + { + "epoch": 1.589746851545605, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0660784244537354, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8626919984817505, + "num_tokens": 476746700.0, + "step": 12497 + }, + { + "epoch": 1.5898740618241956, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.029784679412842, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8691017627716064, + "num_tokens": 476785837.0, + "step": 12498 + }, + { + "epoch": 1.5900012721027859, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9497859477996826, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8698327541351318, + "num_tokens": 476818894.0, + "step": 12499 + }, + { + "epoch": 1.5901284823813764, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8205814361572266, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8464750051498413, + "num_tokens": 476861773.0, + "step": 12500 + }, + { + "epoch": 1.590255692659967, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9067323207855225, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8690066337585449, + "num_tokens": 476899522.0, + "step": 12501 + }, + { + "epoch": 1.5903829029385574, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9526543617248535, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8568931818008423, + "num_tokens": 476939923.0, + "step": 12502 + }, + { + "epoch": 1.590510113217148, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8870588541030884, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8638091087341309, + "num_tokens": 476979867.0, + "step": 12503 + }, + { + "epoch": 1.5906373234957385, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9559608697891235, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8721488118171692, + "num_tokens": 477016068.0, + "step": 12504 + }, + { + "epoch": 1.5907645337743288, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.033210277557373, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8613210916519165, + "num_tokens": 477054526.0, + "step": 12505 + }, + { + "epoch": 1.5908917440529193, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8873018026351929, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8586646914482117, + "num_tokens": 477098029.0, + "step": 12506 + }, + { + "epoch": 1.5910189543315099, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0405917167663574, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.869736909866333, + "num_tokens": 477137070.0, + "step": 12507 + }, + { + "epoch": 1.5911461646101004, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8837649822235107, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8629487156867981, + "num_tokens": 477175640.0, + "step": 12508 + }, + { + "epoch": 1.591273374888691, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8884690999984741, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8661008477210999, + "num_tokens": 477210935.0, + "step": 12509 + }, + { + "epoch": 1.5914005851672814, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9569969177246094, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8713232278823853, + "num_tokens": 477251567.0, + "step": 12510 + }, + { + "epoch": 1.591527795445872, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1566975116729736, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8673234581947327, + "num_tokens": 477287964.0, + "step": 12511 + }, + { + "epoch": 1.5916550057244625, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8715295791625977, + "learning_rate": 1e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8396583795547485, + "num_tokens": 477328660.0, + "step": 12512 + }, + { + "epoch": 1.591782216003053, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.878852367401123, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8679145574569702, + "num_tokens": 477364111.0, + "step": 12513 + }, + { + "epoch": 1.5919094262816436, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0345942974090576, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8710418939590454, + "num_tokens": 477402691.0, + "step": 12514 + }, + { + "epoch": 1.592036636560234, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8743469715118408, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8773678541183472, + "num_tokens": 477437526.0, + "step": 12515 + }, + { + "epoch": 1.5921638468388246, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9526721239089966, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8618484735488892, + "num_tokens": 477479175.0, + "step": 12516 + }, + { + "epoch": 1.5922910571174151, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9891310930252075, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8662778735160828, + "num_tokens": 477516896.0, + "step": 12517 + }, + { + "epoch": 1.5924182673960057, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0051755905151367, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8541632294654846, + "num_tokens": 477555543.0, + "step": 12518 + }, + { + "epoch": 1.5925454776745962, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.932694673538208, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8767502307891846, + "num_tokens": 477592251.0, + "step": 12519 + }, + { + "epoch": 1.5926726879531867, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.061271905899048, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8506871461868286, + "num_tokens": 477630039.0, + "step": 12520 + }, + { + "epoch": 1.5927998982317773, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0618491172790527, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8629611730575562, + "num_tokens": 477661028.0, + "step": 12521 + }, + { + "epoch": 1.5929271085103678, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9086350202560425, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8773513436317444, + "num_tokens": 477699612.0, + "step": 12522 + }, + { + "epoch": 1.5930543187889583, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0879745483398438, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8593488335609436, + "num_tokens": 477737300.0, + "step": 12523 + }, + { + "epoch": 1.5931815290675486, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9756417274475098, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8518847227096558, + "num_tokens": 477776031.0, + "step": 12524 + }, + { + "epoch": 1.5933087393461391, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9097013473510742, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8651372790336609, + "num_tokens": 477813640.0, + "step": 12525 + }, + { + "epoch": 1.5934359496247297, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.926859974861145, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8600825071334839, + "num_tokens": 477848652.0, + "step": 12526 + }, + { + "epoch": 1.5935631599033202, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9598214626312256, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8603267073631287, + "num_tokens": 477883710.0, + "step": 12527 + }, + { + "epoch": 1.5936903701819107, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0406134128570557, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8567644357681274, + "num_tokens": 477920316.0, + "step": 12528 + }, + { + "epoch": 1.5938175804605013, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.920677900314331, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8730851411819458, + "num_tokens": 477955523.0, + "step": 12529 + }, + { + "epoch": 1.5939447907390916, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.982940912246704, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8550781607627869, + "num_tokens": 477995768.0, + "step": 12530 + }, + { + "epoch": 1.594072001017682, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9587351083755493, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8631695508956909, + "num_tokens": 478037718.0, + "step": 12531 + }, + { + "epoch": 1.5941992112962726, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8141549825668335, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8516124486923218, + "num_tokens": 478077388.0, + "step": 12532 + }, + { + "epoch": 1.5943264215748632, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8676677942276, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8783067464828491, + "num_tokens": 478109494.0, + "step": 12533 + }, + { + "epoch": 1.5944536318534537, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.119112014770508, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8598138093948364, + "num_tokens": 478142420.0, + "step": 12534 + }, + { + "epoch": 1.5945808421320442, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9683150053024292, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8678004145622253, + "num_tokens": 478183848.0, + "step": 12535 + }, + { + "epoch": 1.5947080524106347, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9092330932617188, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8727383613586426, + "num_tokens": 478225242.0, + "step": 12536 + }, + { + "epoch": 1.5948352626892253, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8847811222076416, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8700066804885864, + "num_tokens": 478263167.0, + "step": 12537 + }, + { + "epoch": 1.5949624729678158, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0715932846069336, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8650954365730286, + "num_tokens": 478297752.0, + "step": 12538 + }, + { + "epoch": 1.5950896832464063, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.931133508682251, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8656904101371765, + "num_tokens": 478332955.0, + "step": 12539 + }, + { + "epoch": 1.5952168935249968, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.997039556503296, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8419137001037598, + "num_tokens": 478367298.0, + "step": 12540 + }, + { + "epoch": 1.5953441038035874, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7203844785690308, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8682307004928589, + "num_tokens": 478411183.0, + "step": 12541 + }, + { + "epoch": 1.595471314082178, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8913545608520508, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8609161972999573, + "num_tokens": 478451184.0, + "step": 12542 + }, + { + "epoch": 1.5955985243607684, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8343702554702759, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8591858148574829, + "num_tokens": 478490913.0, + "step": 12543 + }, + { + "epoch": 1.595725734639359, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.763007640838623, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8650892376899719, + "num_tokens": 478529769.0, + "step": 12544 + }, + { + "epoch": 1.5958529449179495, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.910457730293274, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8598665595054626, + "num_tokens": 478566665.0, + "step": 12545 + }, + { + "epoch": 1.59598015519654, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7520781755447388, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8917043805122375, + "num_tokens": 478600814.0, + "step": 12546 + }, + { + "epoch": 1.5961073654751305, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8750686645507812, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8685129880905151, + "num_tokens": 478641610.0, + "step": 12547 + }, + { + "epoch": 1.5962345757537209, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.823045015335083, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8718347549438477, + "num_tokens": 478678183.0, + "step": 12548 + }, + { + "epoch": 1.5963617860323114, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0587377548217773, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8640799522399902, + "num_tokens": 478710881.0, + "step": 12549 + }, + { + "epoch": 1.596488996310902, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9043726921081543, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.877363920211792, + "num_tokens": 478748550.0, + "step": 12550 + }, + { + "epoch": 1.5966162065894924, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8885480165481567, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8528463840484619, + "num_tokens": 478790417.0, + "step": 12551 + }, + { + "epoch": 1.596743416868083, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9472761154174805, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8707802295684814, + "num_tokens": 478828856.0, + "step": 12552 + }, + { + "epoch": 1.5968706271466735, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.920197606086731, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8615274429321289, + "num_tokens": 478870240.0, + "step": 12553 + }, + { + "epoch": 1.5969978374252638, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.3864097595214844, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.855514407157898, + "num_tokens": 478911920.0, + "step": 12554 + }, + { + "epoch": 1.5971250477038543, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.174304485321045, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8575741648674011, + "num_tokens": 478949083.0, + "step": 12555 + }, + { + "epoch": 1.5972522579824449, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.974730134010315, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8630855083465576, + "num_tokens": 478984087.0, + "step": 12556 + }, + { + "epoch": 1.5973794682610354, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0402491092681885, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8443241119384766, + "num_tokens": 479019364.0, + "step": 12557 + }, + { + "epoch": 1.597506678539626, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.925475001335144, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8661452531814575, + "num_tokens": 479059002.0, + "step": 12558 + }, + { + "epoch": 1.5976338888182164, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2138593196868896, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8638089895248413, + "num_tokens": 479092676.0, + "step": 12559 + }, + { + "epoch": 1.597761099096807, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.952046513557434, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8678733706474304, + "num_tokens": 479125951.0, + "step": 12560 + }, + { + "epoch": 1.5978883093753975, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7962191104888916, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8808741569519043, + "num_tokens": 479166230.0, + "step": 12561 + }, + { + "epoch": 1.598015519653988, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.892298698425293, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8593153953552246, + "num_tokens": 479205813.0, + "step": 12562 + }, + { + "epoch": 1.5981427299325786, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9606660604476929, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.860052764415741, + "num_tokens": 479244960.0, + "step": 12563 + }, + { + "epoch": 1.598269940211169, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.713261365890503, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8696721792221069, + "num_tokens": 479289617.0, + "step": 12564 + }, + { + "epoch": 1.5983971504897596, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.396711826324463, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8633447885513306, + "num_tokens": 479326377.0, + "step": 12565 + }, + { + "epoch": 1.5985243607683501, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.069859504699707, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8626425266265869, + "num_tokens": 479360563.0, + "step": 12566 + }, + { + "epoch": 1.5986515710469407, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8333101272583008, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.875224232673645, + "num_tokens": 479403042.0, + "step": 12567 + }, + { + "epoch": 1.5987787813255312, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8777395486831665, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.871268630027771, + "num_tokens": 479437099.0, + "step": 12568 + }, + { + "epoch": 1.5989059916041217, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9799104928970337, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8622494339942932, + "num_tokens": 479479326.0, + "step": 12569 + }, + { + "epoch": 1.5990332018827123, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9704432487487793, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8446658253669739, + "num_tokens": 479515320.0, + "step": 12570 + }, + { + "epoch": 1.5991604121613028, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8858261108398438, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8524520397186279, + "num_tokens": 479559960.0, + "step": 12571 + }, + { + "epoch": 1.5992876224398933, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.245356321334839, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8637593984603882, + "num_tokens": 479599115.0, + "step": 12572 + }, + { + "epoch": 1.5994148327184836, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9415977001190186, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8723783493041992, + "num_tokens": 479636116.0, + "step": 12573 + }, + { + "epoch": 1.5995420429970741, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8577690124511719, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8555411100387573, + "num_tokens": 479676880.0, + "step": 12574 + }, + { + "epoch": 1.5996692532756647, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8336650133132935, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8644652366638184, + "num_tokens": 479717424.0, + "step": 12575 + }, + { + "epoch": 1.5997964635542552, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9370919466018677, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8628047704696655, + "num_tokens": 479756240.0, + "step": 12576 + }, + { + "epoch": 1.5999236738328457, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0372729301452637, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8500993847846985, + "num_tokens": 479791863.0, + "step": 12577 + }, + { + "epoch": 1.6000508841114363, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.359954595565796, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8730047345161438, + "num_tokens": 479827252.0, + "step": 12578 + }, + { + "epoch": 1.6001780943900266, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9630235433578491, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8611770868301392, + "num_tokens": 479863100.0, + "step": 12579 + }, + { + "epoch": 1.600305304668617, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9298855066299438, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8525416851043701, + "num_tokens": 479899684.0, + "step": 12580 + }, + { + "epoch": 1.6004325149472076, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8208891153335571, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8649827837944031, + "num_tokens": 479937656.0, + "step": 12581 + }, + { + "epoch": 1.6005597252257981, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1617019176483154, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8574495315551758, + "num_tokens": 479973357.0, + "step": 12582 + }, + { + "epoch": 1.6006869355043887, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7619935274124146, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8645466566085815, + "num_tokens": 480017862.0, + "step": 12583 + }, + { + "epoch": 1.6008141457829792, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8329975605010986, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8690425157546997, + "num_tokens": 480057039.0, + "step": 12584 + }, + { + "epoch": 1.6009413560615697, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.031357526779175, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.868133544921875, + "num_tokens": 480093396.0, + "step": 12585 + }, + { + "epoch": 1.6010685663401603, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2120046615600586, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8604360818862915, + "num_tokens": 480125200.0, + "step": 12586 + }, + { + "epoch": 1.6011957766187508, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.013031244277954, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8594632148742676, + "num_tokens": 480158085.0, + "step": 12587 + }, + { + "epoch": 1.6013229868973413, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9610674381256104, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8791730403900146, + "num_tokens": 480192979.0, + "step": 12588 + }, + { + "epoch": 1.6014501971759318, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8516334295272827, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8754087686538696, + "num_tokens": 480227984.0, + "step": 12589 + }, + { + "epoch": 1.6015774074545224, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9566289186477661, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8809614181518555, + "num_tokens": 480264280.0, + "step": 12590 + }, + { + "epoch": 1.601704617733113, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1583752632141113, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8487541675567627, + "num_tokens": 480300670.0, + "step": 12591 + }, + { + "epoch": 1.6018318280117034, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.047647714614868, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8630678057670593, + "num_tokens": 480334460.0, + "step": 12592 + }, + { + "epoch": 1.601959038290294, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9568032026290894, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8735090494155884, + "num_tokens": 480376066.0, + "step": 12593 + }, + { + "epoch": 1.6020862485688845, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7638486623764038, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8707084655761719, + "num_tokens": 480417575.0, + "step": 12594 + }, + { + "epoch": 1.602213458847475, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8769532442092896, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8547144532203674, + "num_tokens": 480454106.0, + "step": 12595 + }, + { + "epoch": 1.6023406691260655, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9393811225891113, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8515106439590454, + "num_tokens": 480493031.0, + "step": 12596 + }, + { + "epoch": 1.6024678794046558, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8467910289764404, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8646405935287476, + "num_tokens": 480528326.0, + "step": 12597 + }, + { + "epoch": 1.6025950896832464, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8579256534576416, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8705018758773804, + "num_tokens": 480565399.0, + "step": 12598 + }, + { + "epoch": 1.602722299961837, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.4043095111846924, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8789088726043701, + "num_tokens": 480600028.0, + "step": 12599 + }, + { + "epoch": 1.6028495102404274, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0075509548187256, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8638367056846619, + "num_tokens": 480635759.0, + "step": 12600 + }, + { + "epoch": 1.602976720519018, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0920393466949463, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.869992733001709, + "num_tokens": 480669528.0, + "step": 12601 + }, + { + "epoch": 1.6031039307976085, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8931769132614136, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8608062863349915, + "num_tokens": 480709425.0, + "step": 12602 + }, + { + "epoch": 1.6032311410761988, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1927292346954346, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8495017290115356, + "num_tokens": 480745010.0, + "step": 12603 + }, + { + "epoch": 1.6033583513547893, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9366683959960938, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8702512979507446, + "num_tokens": 480780320.0, + "step": 12604 + }, + { + "epoch": 1.6034855616333799, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9321949481964111, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.870134711265564, + "num_tokens": 480819335.0, + "step": 12605 + }, + { + "epoch": 1.6036127719119704, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9349844455718994, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8763671517372131, + "num_tokens": 480852702.0, + "step": 12606 + }, + { + "epoch": 1.603739982190561, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0742204189300537, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8581327199935913, + "num_tokens": 480890258.0, + "step": 12607 + }, + { + "epoch": 1.6038671924691514, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8696337938308716, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8640651702880859, + "num_tokens": 480929555.0, + "step": 12608 + }, + { + "epoch": 1.603994402747742, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7024420499801636, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8868429064750671, + "num_tokens": 480970295.0, + "step": 12609 + }, + { + "epoch": 1.6041216130263325, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3838610649108887, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.865799069404602, + "num_tokens": 481011213.0, + "step": 12610 + }, + { + "epoch": 1.604248823304923, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8605096340179443, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.859038233757019, + "num_tokens": 481045273.0, + "step": 12611 + }, + { + "epoch": 1.6043760335835135, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8394758701324463, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8442264795303345, + "num_tokens": 481084567.0, + "step": 12612 + }, + { + "epoch": 1.604503243862104, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8508009910583496, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8649945259094238, + "num_tokens": 481119455.0, + "step": 12613 + }, + { + "epoch": 1.6046304541406946, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9696447849273682, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.849707841873169, + "num_tokens": 481154718.0, + "step": 12614 + }, + { + "epoch": 1.6047576644192851, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7827324867248535, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8586865663528442, + "num_tokens": 481193716.0, + "step": 12615 + }, + { + "epoch": 1.6048848746978757, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9992966651916504, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8407612442970276, + "num_tokens": 481231378.0, + "step": 12616 + }, + { + "epoch": 1.6050120849764662, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9688050746917725, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8621139526367188, + "num_tokens": 481267907.0, + "step": 12617 + }, + { + "epoch": 1.6051392952550567, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8296536207199097, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8649247884750366, + "num_tokens": 481309899.0, + "step": 12618 + }, + { + "epoch": 1.6052665055336472, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8938120603561401, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8731949925422668, + "num_tokens": 481348845.0, + "step": 12619 + }, + { + "epoch": 1.6053937158122378, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.925218939781189, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8549606800079346, + "num_tokens": 481384999.0, + "step": 12620 + }, + { + "epoch": 1.6055209260908283, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9711096286773682, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8666068315505981, + "num_tokens": 481417287.0, + "step": 12621 + }, + { + "epoch": 1.6056481363694186, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.947020173072815, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8684046268463135, + "num_tokens": 481455943.0, + "step": 12622 + }, + { + "epoch": 1.6057753466480091, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9306014776229858, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8790560364723206, + "num_tokens": 481492017.0, + "step": 12623 + }, + { + "epoch": 1.6059025569265997, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8016541004180908, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8565382361412048, + "num_tokens": 481535115.0, + "step": 12624 + }, + { + "epoch": 1.6060297672051902, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7575792074203491, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8489853143692017, + "num_tokens": 481580048.0, + "step": 12625 + }, + { + "epoch": 1.6061569774837807, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9383792877197266, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8748840093612671, + "num_tokens": 481616748.0, + "step": 12626 + }, + { + "epoch": 1.6062841877623713, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.8132710456848145, + "learning_rate": 1e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8350775241851807, + "num_tokens": 481659123.0, + "step": 12627 + }, + { + "epoch": 1.6064113980409616, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9173818826675415, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8542772531509399, + "num_tokens": 481704791.0, + "step": 12628 + }, + { + "epoch": 1.606538608319552, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9915598630905151, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8514226675033569, + "num_tokens": 481742913.0, + "step": 12629 + }, + { + "epoch": 1.6066658185981426, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.870794653892517, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8508330583572388, + "num_tokens": 481783549.0, + "step": 12630 + }, + { + "epoch": 1.6067930288767331, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9243143796920776, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8693960905075073, + "num_tokens": 481818486.0, + "step": 12631 + }, + { + "epoch": 1.6069202391553237, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9634290933609009, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.863590657711029, + "num_tokens": 481849969.0, + "step": 12632 + }, + { + "epoch": 1.6070474494339142, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1323812007904053, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.847390353679657, + "num_tokens": 481886108.0, + "step": 12633 + }, + { + "epoch": 1.6071746597125047, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9173569679260254, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8757773637771606, + "num_tokens": 481919389.0, + "step": 12634 + }, + { + "epoch": 1.6073018699910953, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.933261752128601, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8768681883811951, + "num_tokens": 481958531.0, + "step": 12635 + }, + { + "epoch": 1.6074290802696858, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7872549295425415, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8582642078399658, + "num_tokens": 481999864.0, + "step": 12636 + }, + { + "epoch": 1.6075562905482763, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8860797882080078, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8868993520736694, + "num_tokens": 482033890.0, + "step": 12637 + }, + { + "epoch": 1.6076835008268668, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8329980373382568, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8689125776290894, + "num_tokens": 482071186.0, + "step": 12638 + }, + { + "epoch": 1.6078107111054574, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8819024562835693, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8620549440383911, + "num_tokens": 482107218.0, + "step": 12639 + }, + { + "epoch": 1.607937921384048, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0708675384521484, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8600724935531616, + "num_tokens": 482146101.0, + "step": 12640 + }, + { + "epoch": 1.6080651316626384, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9968812465667725, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8777443170547485, + "num_tokens": 482178531.0, + "step": 12641 + }, + { + "epoch": 1.608192341941229, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.319624900817871, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8719525337219238, + "num_tokens": 482217019.0, + "step": 12642 + }, + { + "epoch": 1.6083195522198195, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9670361280441284, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8653658032417297, + "num_tokens": 482250547.0, + "step": 12643 + }, + { + "epoch": 1.60844676249841, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8617603778839111, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8597779273986816, + "num_tokens": 482287009.0, + "step": 12644 + }, + { + "epoch": 1.6085739727770005, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.844240427017212, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8511085510253906, + "num_tokens": 482327373.0, + "step": 12645 + }, + { + "epoch": 1.6087011830555908, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9442663192749023, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.852742075920105, + "num_tokens": 482363409.0, + "step": 12646 + }, + { + "epoch": 1.6088283933341814, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1566667556762695, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8536142706871033, + "num_tokens": 482396499.0, + "step": 12647 + }, + { + "epoch": 1.608955603612772, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.804453730583191, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.865445077419281, + "num_tokens": 482435059.0, + "step": 12648 + }, + { + "epoch": 1.6090828138913624, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8825435638427734, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8552919626235962, + "num_tokens": 482472793.0, + "step": 12649 + }, + { + "epoch": 1.609210024169953, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.112428903579712, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8649555444717407, + "num_tokens": 482508857.0, + "step": 12650 + }, + { + "epoch": 1.6093372344485435, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8696125745773315, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8483030200004578, + "num_tokens": 482548526.0, + "step": 12651 + }, + { + "epoch": 1.6094644447271338, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9833701848983765, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8553264141082764, + "num_tokens": 482586461.0, + "step": 12652 + }, + { + "epoch": 1.6095916550057243, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.00252628326416, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8562286496162415, + "num_tokens": 482624642.0, + "step": 12653 + }, + { + "epoch": 1.6097188652843148, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.3734853267669678, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8626031875610352, + "num_tokens": 482664096.0, + "step": 12654 + }, + { + "epoch": 1.6098460755629054, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9260470867156982, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8748158812522888, + "num_tokens": 482701230.0, + "step": 12655 + }, + { + "epoch": 1.609973285841496, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8471826314926147, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8644959926605225, + "num_tokens": 482741611.0, + "step": 12656 + }, + { + "epoch": 1.6101004961200864, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.826675534248352, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8744953274726868, + "num_tokens": 482779482.0, + "step": 12657 + }, + { + "epoch": 1.610227706398677, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8870761394500732, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8732688426971436, + "num_tokens": 482816279.0, + "step": 12658 + }, + { + "epoch": 1.6103549166772675, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7738127708435059, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8546524047851562, + "num_tokens": 482859941.0, + "step": 12659 + }, + { + "epoch": 1.610482126955858, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8370871543884277, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8679021596908569, + "num_tokens": 482894348.0, + "step": 12660 + }, + { + "epoch": 1.6106093372344485, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.921622395515442, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8587244153022766, + "num_tokens": 482930882.0, + "step": 12661 + }, + { + "epoch": 1.610736547513039, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9620065689086914, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8468340635299683, + "num_tokens": 482969515.0, + "step": 12662 + }, + { + "epoch": 1.6108637577916296, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.891637921333313, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.859652042388916, + "num_tokens": 483008074.0, + "step": 12663 + }, + { + "epoch": 1.6109909680702201, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.193650245666504, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8559770584106445, + "num_tokens": 483038920.0, + "step": 12664 + }, + { + "epoch": 1.6111181783488107, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9208407402038574, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8734205961227417, + "num_tokens": 483075559.0, + "step": 12665 + }, + { + "epoch": 1.6112453886274012, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.010277509689331, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.86773681640625, + "num_tokens": 483114378.0, + "step": 12666 + }, + { + "epoch": 1.6113725989059917, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2048490047454834, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.868651807308197, + "num_tokens": 483147911.0, + "step": 12667 + }, + { + "epoch": 1.6114998091845822, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8815407752990723, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8780046105384827, + "num_tokens": 483188275.0, + "step": 12668 + }, + { + "epoch": 1.6116270194631728, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8457446098327637, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8761711120605469, + "num_tokens": 483230804.0, + "step": 12669 + }, + { + "epoch": 1.6117542297417633, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1504757404327393, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8674516677856445, + "num_tokens": 483262368.0, + "step": 12670 + }, + { + "epoch": 1.6118814400203536, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9592171907424927, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8518240451812744, + "num_tokens": 483300894.0, + "step": 12671 + }, + { + "epoch": 1.6120086502989441, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.940171241760254, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8708683252334595, + "num_tokens": 483337377.0, + "step": 12672 + }, + { + "epoch": 1.6121358605775347, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9191274642944336, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8637304306030273, + "num_tokens": 483376572.0, + "step": 12673 + }, + { + "epoch": 1.6122630708561252, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9299291372299194, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8511750102043152, + "num_tokens": 483415584.0, + "step": 12674 + }, + { + "epoch": 1.6123902811347157, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7367204427719116, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8641235828399658, + "num_tokens": 483459584.0, + "step": 12675 + }, + { + "epoch": 1.6125174914133062, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 4.0251545906066895, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8578507900238037, + "num_tokens": 483504260.0, + "step": 12676 + }, + { + "epoch": 1.6126447016918966, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0870981216430664, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8672335147857666, + "num_tokens": 483547271.0, + "step": 12677 + }, + { + "epoch": 1.612771911970487, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1759555339813232, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8676043748855591, + "num_tokens": 483582255.0, + "step": 12678 + }, + { + "epoch": 1.6128991222490776, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9107723236083984, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8591984510421753, + "num_tokens": 483622290.0, + "step": 12679 + }, + { + "epoch": 1.6130263325276681, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9522173404693604, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8764690160751343, + "num_tokens": 483657253.0, + "step": 12680 + }, + { + "epoch": 1.6131535428062587, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8891050815582275, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8687193989753723, + "num_tokens": 483693062.0, + "step": 12681 + }, + { + "epoch": 1.6132807530848492, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7801845073699951, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8590612411499023, + "num_tokens": 483735845.0, + "step": 12682 + }, + { + "epoch": 1.6134079633634397, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7821098566055298, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8679029941558838, + "num_tokens": 483783833.0, + "step": 12683 + }, + { + "epoch": 1.6135351736420303, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8925364017486572, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8630570769309998, + "num_tokens": 483820175.0, + "step": 12684 + }, + { + "epoch": 1.6136623839206208, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.901810884475708, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.874735951423645, + "num_tokens": 483863454.0, + "step": 12685 + }, + { + "epoch": 1.6137895941992113, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.878396987915039, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8838762640953064, + "num_tokens": 483900657.0, + "step": 12686 + }, + { + "epoch": 1.6139168044778018, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.984955906867981, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8579680919647217, + "num_tokens": 483935912.0, + "step": 12687 + }, + { + "epoch": 1.6140440147563924, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0120716094970703, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8650794625282288, + "num_tokens": 483968594.0, + "step": 12688 + }, + { + "epoch": 1.614171225034983, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7390766143798828, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.873546838760376, + "num_tokens": 484010866.0, + "step": 12689 + }, + { + "epoch": 1.6142984353135734, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8066953420639038, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8597466349601746, + "num_tokens": 484051355.0, + "step": 12690 + }, + { + "epoch": 1.614425645592164, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.85312020778656, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8746650218963623, + "num_tokens": 484087049.0, + "step": 12691 + }, + { + "epoch": 1.6145528558707545, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3455121517181396, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8532609939575195, + "num_tokens": 484124914.0, + "step": 12692 + }, + { + "epoch": 1.614680066149345, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1368181705474854, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8806251287460327, + "num_tokens": 484154524.0, + "step": 12693 + }, + { + "epoch": 1.6148072764279355, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8471252918243408, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8632482290267944, + "num_tokens": 484195091.0, + "step": 12694 + }, + { + "epoch": 1.6149344867065258, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.882559895515442, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8665183782577515, + "num_tokens": 484236787.0, + "step": 12695 + }, + { + "epoch": 1.6150616969851164, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.696974277496338, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8600517511367798, + "num_tokens": 484281748.0, + "step": 12696 + }, + { + "epoch": 1.615188907263707, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8285623788833618, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8715090751647949, + "num_tokens": 484321181.0, + "step": 12697 + }, + { + "epoch": 1.6153161175422974, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9706223011016846, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8627285957336426, + "num_tokens": 484360476.0, + "step": 12698 + }, + { + "epoch": 1.615443327820888, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9316115379333496, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8651286363601685, + "num_tokens": 484395065.0, + "step": 12699 + }, + { + "epoch": 1.6155705380994785, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7975664138793945, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8662121295928955, + "num_tokens": 484439309.0, + "step": 12700 + }, + { + "epoch": 1.6156977483780688, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0205819606781006, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8645018935203552, + "num_tokens": 484473018.0, + "step": 12701 + }, + { + "epoch": 1.6158249586566593, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9681179523468018, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8711794018745422, + "num_tokens": 484518306.0, + "step": 12702 + }, + { + "epoch": 1.6159521689352498, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9047237634658813, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8504185676574707, + "num_tokens": 484556348.0, + "step": 12703 + }, + { + "epoch": 1.6160793792138404, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0359480381011963, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8748653531074524, + "num_tokens": 484593136.0, + "step": 12704 + }, + { + "epoch": 1.616206589492431, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3171334266662598, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8736284971237183, + "num_tokens": 484628025.0, + "step": 12705 + }, + { + "epoch": 1.6163337997710214, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8770899772644043, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.87054842710495, + "num_tokens": 484666703.0, + "step": 12706 + }, + { + "epoch": 1.616461010049612, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8035337924957275, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8728874921798706, + "num_tokens": 484703417.0, + "step": 12707 + }, + { + "epoch": 1.6165882203282025, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8738698959350586, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8790057897567749, + "num_tokens": 484741830.0, + "step": 12708 + }, + { + "epoch": 1.616715430606793, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8843096494674683, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8639692068099976, + "num_tokens": 484780140.0, + "step": 12709 + }, + { + "epoch": 1.6168426408853835, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9503813982009888, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.863181471824646, + "num_tokens": 484813856.0, + "step": 12710 + }, + { + "epoch": 1.616969851163974, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8256479501724243, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8746376037597656, + "num_tokens": 484851383.0, + "step": 12711 + }, + { + "epoch": 1.6170970614425646, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1051688194274902, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.874346137046814, + "num_tokens": 484884361.0, + "step": 12712 + }, + { + "epoch": 1.6172242717211551, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9408416748046875, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8452172875404358, + "num_tokens": 484922106.0, + "step": 12713 + }, + { + "epoch": 1.6173514819997457, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0261788368225098, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.865318775177002, + "num_tokens": 484963100.0, + "step": 12714 + }, + { + "epoch": 1.6174786922783362, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8222601413726807, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8753136992454529, + "num_tokens": 485006896.0, + "step": 12715 + }, + { + "epoch": 1.6176059025569267, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7745256423950195, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8650703430175781, + "num_tokens": 485045968.0, + "step": 12716 + }, + { + "epoch": 1.6177331128355172, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7858171463012695, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8631477355957031, + "num_tokens": 485084736.0, + "step": 12717 + }, + { + "epoch": 1.6178603231141078, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9896684885025024, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8716318607330322, + "num_tokens": 485123130.0, + "step": 12718 + }, + { + "epoch": 1.6179875333926983, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.019639253616333, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.862032413482666, + "num_tokens": 485158072.0, + "step": 12719 + }, + { + "epoch": 1.6181147436712886, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9975935220718384, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8731008768081665, + "num_tokens": 485196783.0, + "step": 12720 + }, + { + "epoch": 1.6182419539498791, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8872591257095337, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8705289363861084, + "num_tokens": 485231433.0, + "step": 12721 + }, + { + "epoch": 1.6183691642284697, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.962626576423645, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8628448247909546, + "num_tokens": 485269016.0, + "step": 12722 + }, + { + "epoch": 1.6184963745070602, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.085501194000244, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8541367053985596, + "num_tokens": 485308293.0, + "step": 12723 + }, + { + "epoch": 1.6186235847856507, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9362127780914307, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8749351501464844, + "num_tokens": 485340758.0, + "step": 12724 + }, + { + "epoch": 1.6187507950642412, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7938669919967651, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8791016340255737, + "num_tokens": 485384047.0, + "step": 12725 + }, + { + "epoch": 1.6188780053428315, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.082049608230591, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8680282831192017, + "num_tokens": 485416909.0, + "step": 12726 + }, + { + "epoch": 1.619005215621422, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0713186264038086, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8646044731140137, + "num_tokens": 485449532.0, + "step": 12727 + }, + { + "epoch": 1.6191324259000126, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7873238325119019, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8754799962043762, + "num_tokens": 485491171.0, + "step": 12728 + }, + { + "epoch": 1.6192596361786031, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9516195058822632, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8613443374633789, + "num_tokens": 485525511.0, + "step": 12729 + }, + { + "epoch": 1.6193868464571937, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.561558246612549, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8614621162414551, + "num_tokens": 485562517.0, + "step": 12730 + }, + { + "epoch": 1.6195140567357842, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8994070291519165, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8682392239570618, + "num_tokens": 485601058.0, + "step": 12731 + }, + { + "epoch": 1.6196412670143747, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9706472158432007, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8790457844734192, + "num_tokens": 485634737.0, + "step": 12732 + }, + { + "epoch": 1.6197684772929652, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8809711933135986, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8684335947036743, + "num_tokens": 485671300.0, + "step": 12733 + }, + { + "epoch": 1.6198956875715558, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8979477882385254, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8726165890693665, + "num_tokens": 485708712.0, + "step": 12734 + }, + { + "epoch": 1.6200228978501463, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0339555740356445, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8767828345298767, + "num_tokens": 485746820.0, + "step": 12735 + }, + { + "epoch": 1.6201501081287368, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8130792379379272, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8661834001541138, + "num_tokens": 485785921.0, + "step": 12736 + }, + { + "epoch": 1.6202773184073274, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8364753723144531, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8656389713287354, + "num_tokens": 485825766.0, + "step": 12737 + }, + { + "epoch": 1.6204045286859179, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8577337265014648, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8590283989906311, + "num_tokens": 485862288.0, + "step": 12738 + }, + { + "epoch": 1.6205317389645084, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7683607339859009, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8598713278770447, + "num_tokens": 485903582.0, + "step": 12739 + }, + { + "epoch": 1.620658949243099, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7829328775405884, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8623406887054443, + "num_tokens": 485950931.0, + "step": 12740 + }, + { + "epoch": 1.6207861595216895, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8662159442901611, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.86728435754776, + "num_tokens": 485984185.0, + "step": 12741 + }, + { + "epoch": 1.62091336980028, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.047990322113037, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8714893460273743, + "num_tokens": 486018796.0, + "step": 12742 + }, + { + "epoch": 1.6210405800788705, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9696905612945557, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.851988673210144, + "num_tokens": 486055328.0, + "step": 12743 + }, + { + "epoch": 1.6211677903574608, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8233354091644287, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8601861000061035, + "num_tokens": 486092340.0, + "step": 12744 + }, + { + "epoch": 1.6212950006360514, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.081663131713867, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8528963923454285, + "num_tokens": 486123195.0, + "step": 12745 + }, + { + "epoch": 1.621422210914642, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8623292446136475, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8751091957092285, + "num_tokens": 486167145.0, + "step": 12746 + }, + { + "epoch": 1.6215494211932324, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.4938783645629883, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8578997850418091, + "num_tokens": 486207925.0, + "step": 12747 + }, + { + "epoch": 1.621676631471823, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2066378593444824, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8696404695510864, + "num_tokens": 486245446.0, + "step": 12748 + }, + { + "epoch": 1.6218038417504135, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9148305654525757, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8656189441680908, + "num_tokens": 486280461.0, + "step": 12749 + }, + { + "epoch": 1.6219310520290038, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8876242637634277, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8532098531723022, + "num_tokens": 486323926.0, + "step": 12750 + }, + { + "epoch": 1.6220582623075943, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.868825078010559, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8578729629516602, + "num_tokens": 486360016.0, + "step": 12751 + }, + { + "epoch": 1.6221854725861848, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0977225303649902, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8694323897361755, + "num_tokens": 486394994.0, + "step": 12752 + }, + { + "epoch": 1.6223126828647754, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7789653539657593, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8641608953475952, + "num_tokens": 486436909.0, + "step": 12753 + }, + { + "epoch": 1.622439893143366, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9573051929473877, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.871585488319397, + "num_tokens": 486473201.0, + "step": 12754 + }, + { + "epoch": 1.6225671034219564, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0221011638641357, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8464103937149048, + "num_tokens": 486510733.0, + "step": 12755 + }, + { + "epoch": 1.622694313700547, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8669582605361938, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.880062460899353, + "num_tokens": 486546666.0, + "step": 12756 + }, + { + "epoch": 1.6228215239791375, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3742966651916504, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8460346460342407, + "num_tokens": 486585639.0, + "step": 12757 + }, + { + "epoch": 1.622948734257728, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8458056449890137, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8698694705963135, + "num_tokens": 486623028.0, + "step": 12758 + }, + { + "epoch": 1.6230759445363185, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9868528842926025, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8603872060775757, + "num_tokens": 486655906.0, + "step": 12759 + }, + { + "epoch": 1.623203154814909, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9909124374389648, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8621654510498047, + "num_tokens": 486689461.0, + "step": 12760 + }, + { + "epoch": 1.6233303650934996, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8350573778152466, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8488678336143494, + "num_tokens": 486728373.0, + "step": 12761 + }, + { + "epoch": 1.6234575753720901, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7665563821792603, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8752067685127258, + "num_tokens": 486769955.0, + "step": 12762 + }, + { + "epoch": 1.6235847856506807, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.104177713394165, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8697443008422852, + "num_tokens": 486806468.0, + "step": 12763 + }, + { + "epoch": 1.6237119959292712, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.208876848220825, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8513100743293762, + "num_tokens": 486846526.0, + "step": 12764 + }, + { + "epoch": 1.6238392062078617, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8338412046432495, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8813409209251404, + "num_tokens": 486886323.0, + "step": 12765 + }, + { + "epoch": 1.6239664164864522, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9508519172668457, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8459151983261108, + "num_tokens": 486918099.0, + "step": 12766 + }, + { + "epoch": 1.6240936267650428, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.865256905555725, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8790585994720459, + "num_tokens": 486953789.0, + "step": 12767 + }, + { + "epoch": 1.6242208370436333, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8392332792282104, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8643322587013245, + "num_tokens": 486996165.0, + "step": 12768 + }, + { + "epoch": 1.6243480473222236, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1227850914001465, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.876248836517334, + "num_tokens": 487028249.0, + "step": 12769 + }, + { + "epoch": 1.6244752576008141, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.782165288925171, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8672007322311401, + "num_tokens": 487070099.0, + "step": 12770 + }, + { + "epoch": 1.6246024678794047, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8506982326507568, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.869074821472168, + "num_tokens": 487107117.0, + "step": 12771 + }, + { + "epoch": 1.6247296781579952, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8397732973098755, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8534063100814819, + "num_tokens": 487146024.0, + "step": 12772 + }, + { + "epoch": 1.6248568884365857, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.215872287750244, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.851746678352356, + "num_tokens": 487189864.0, + "step": 12773 + }, + { + "epoch": 1.6249840987151762, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.4311656951904297, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8662892580032349, + "num_tokens": 487230307.0, + "step": 12774 + }, + { + "epoch": 1.6251113089937665, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1505508422851562, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8680731058120728, + "num_tokens": 487261387.0, + "step": 12775 + }, + { + "epoch": 1.625238519272357, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9701995849609375, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8544608354568481, + "num_tokens": 487298611.0, + "step": 12776 + }, + { + "epoch": 1.6253657295509476, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8815996646881104, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.864270031452179, + "num_tokens": 487338172.0, + "step": 12777 + }, + { + "epoch": 1.6254929398295381, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9709724187850952, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8670030832290649, + "num_tokens": 487375126.0, + "step": 12778 + }, + { + "epoch": 1.6256201501081287, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8645507097244263, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8760913610458374, + "num_tokens": 487416539.0, + "step": 12779 + }, + { + "epoch": 1.6257473603867192, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9173542261123657, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8667083978652954, + "num_tokens": 487448429.0, + "step": 12780 + }, + { + "epoch": 1.6258745706653097, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8948757648468018, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8637913465499878, + "num_tokens": 487486054.0, + "step": 12781 + }, + { + "epoch": 1.6260017809439002, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.040060520172119, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8711152076721191, + "num_tokens": 487523771.0, + "step": 12782 + }, + { + "epoch": 1.6261289912224908, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1728694438934326, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8707457780838013, + "num_tokens": 487561334.0, + "step": 12783 + }, + { + "epoch": 1.6262562015010813, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7594902515411377, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8660271167755127, + "num_tokens": 487606382.0, + "step": 12784 + }, + { + "epoch": 1.6263834117796718, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7746977806091309, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8612558841705322, + "num_tokens": 487647640.0, + "step": 12785 + }, + { + "epoch": 1.6265106220582624, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8586174249649048, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8624814748764038, + "num_tokens": 487682604.0, + "step": 12786 + }, + { + "epoch": 1.6266378323368529, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0362040996551514, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8677265644073486, + "num_tokens": 487718271.0, + "step": 12787 + }, + { + "epoch": 1.6267650426154434, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1276743412017822, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8518502712249756, + "num_tokens": 487752872.0, + "step": 12788 + }, + { + "epoch": 1.626892252894034, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.86642324924469, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8560100793838501, + "num_tokens": 487798309.0, + "step": 12789 + }, + { + "epoch": 1.6270194631726245, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8947670459747314, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8544489741325378, + "num_tokens": 487834052.0, + "step": 12790 + }, + { + "epoch": 1.627146673451215, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1425516605377197, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8535484671592712, + "num_tokens": 487868662.0, + "step": 12791 + }, + { + "epoch": 1.6272738837298055, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9241985082626343, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.862307071685791, + "num_tokens": 487907425.0, + "step": 12792 + }, + { + "epoch": 1.6274010940083958, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.925068736076355, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8723962306976318, + "num_tokens": 487942979.0, + "step": 12793 + }, + { + "epoch": 1.6275283042869864, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9572718143463135, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8692704439163208, + "num_tokens": 487982402.0, + "step": 12794 + }, + { + "epoch": 1.6276555145655769, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9117921590805054, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.872377872467041, + "num_tokens": 488020071.0, + "step": 12795 + }, + { + "epoch": 1.6277827248441674, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9366835355758667, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8515785932540894, + "num_tokens": 488057457.0, + "step": 12796 + }, + { + "epoch": 1.627909935122758, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0004780292510986, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.860858678817749, + "num_tokens": 488094002.0, + "step": 12797 + }, + { + "epoch": 1.6280371454013485, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8653497695922852, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8498914837837219, + "num_tokens": 488134995.0, + "step": 12798 + }, + { + "epoch": 1.6281643556799388, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8718783855438232, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8678851127624512, + "num_tokens": 488169942.0, + "step": 12799 + }, + { + "epoch": 1.6282915659585293, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9580434560775757, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8671234250068665, + "num_tokens": 488210852.0, + "step": 12800 + }, + { + "epoch": 1.6284187762371198, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8110891580581665, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.868099570274353, + "num_tokens": 488253067.0, + "step": 12801 + }, + { + "epoch": 1.6285459865157104, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8775874376296997, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8732484579086304, + "num_tokens": 488293498.0, + "step": 12802 + }, + { + "epoch": 1.628673196794301, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8236147165298462, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8680272698402405, + "num_tokens": 488333931.0, + "step": 12803 + }, + { + "epoch": 1.6288004070728914, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9401445388793945, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8699221014976501, + "num_tokens": 488371724.0, + "step": 12804 + }, + { + "epoch": 1.628927617351482, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8837080001831055, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8597084283828735, + "num_tokens": 488415171.0, + "step": 12805 + }, + { + "epoch": 1.6290548276300725, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.064307689666748, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8593146800994873, + "num_tokens": 488451672.0, + "step": 12806 + }, + { + "epoch": 1.629182037908663, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.82760488986969, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8788918256759644, + "num_tokens": 488492657.0, + "step": 12807 + }, + { + "epoch": 1.6293092481872535, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7802844047546387, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8716717958450317, + "num_tokens": 488530681.0, + "step": 12808 + }, + { + "epoch": 1.629436458465844, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9412347078323364, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8743200302124023, + "num_tokens": 488560542.0, + "step": 12809 + }, + { + "epoch": 1.6295636687444346, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9929836988449097, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8668385744094849, + "num_tokens": 488597491.0, + "step": 12810 + }, + { + "epoch": 1.6296908790230251, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.066058397293091, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8722996711730957, + "num_tokens": 488634556.0, + "step": 12811 + }, + { + "epoch": 1.6298180893016156, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9442555904388428, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8621600866317749, + "num_tokens": 488668174.0, + "step": 12812 + }, + { + "epoch": 1.6299452995802062, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8753550052642822, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8693500757217407, + "num_tokens": 488705994.0, + "step": 12813 + }, + { + "epoch": 1.6300725098587967, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.959696650505066, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8554912805557251, + "num_tokens": 488746908.0, + "step": 12814 + }, + { + "epoch": 1.6301997201373872, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8143887519836426, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8658486604690552, + "num_tokens": 488785607.0, + "step": 12815 + }, + { + "epoch": 1.6303269304159778, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9839800596237183, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8637708425521851, + "num_tokens": 488823831.0, + "step": 12816 + }, + { + "epoch": 1.630454140694568, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.839540719985962, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8716647028923035, + "num_tokens": 488859944.0, + "step": 12817 + }, + { + "epoch": 1.6305813509731586, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7563657760620117, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8501595258712769, + "num_tokens": 488903764.0, + "step": 12818 + }, + { + "epoch": 1.6307085612517491, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7999999523162842, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8555363416671753, + "num_tokens": 488946890.0, + "step": 12819 + }, + { + "epoch": 1.6308357715303397, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9316693544387817, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8625813722610474, + "num_tokens": 488981901.0, + "step": 12820 + }, + { + "epoch": 1.6309629818089302, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7279183864593506, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8685351610183716, + "num_tokens": 489024180.0, + "step": 12821 + }, + { + "epoch": 1.6310901920875207, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7717316150665283, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.868149995803833, + "num_tokens": 489070971.0, + "step": 12822 + }, + { + "epoch": 1.6312174023661112, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.041120767593384, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8752516508102417, + "num_tokens": 489111636.0, + "step": 12823 + }, + { + "epoch": 1.6313446126447015, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.84757399559021, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8629615902900696, + "num_tokens": 489148677.0, + "step": 12824 + }, + { + "epoch": 1.631471822923292, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9416905641555786, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8532325029373169, + "num_tokens": 489186222.0, + "step": 12825 + }, + { + "epoch": 1.6315990332018826, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0418872833251953, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8386783003807068, + "num_tokens": 489223173.0, + "step": 12826 + }, + { + "epoch": 1.6317262434804731, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9480022192001343, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8657761216163635, + "num_tokens": 489259236.0, + "step": 12827 + }, + { + "epoch": 1.6318534537590637, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9246138334274292, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8570215702056885, + "num_tokens": 489298449.0, + "step": 12828 + }, + { + "epoch": 1.6319806640376542, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1059539318084717, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8439064025878906, + "num_tokens": 489334800.0, + "step": 12829 + }, + { + "epoch": 1.6321078743162447, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7378605604171753, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8710629343986511, + "num_tokens": 489378510.0, + "step": 12830 + }, + { + "epoch": 1.6322350845948352, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2048838138580322, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8529378771781921, + "num_tokens": 489419267.0, + "step": 12831 + }, + { + "epoch": 1.6323622948734258, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2019104957580566, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8682037591934204, + "num_tokens": 489454772.0, + "step": 12832 + }, + { + "epoch": 1.6324895051520163, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9695032835006714, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8582881689071655, + "num_tokens": 489490146.0, + "step": 12833 + }, + { + "epoch": 1.6326167154306068, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8860152959823608, + "learning_rate": 1e-06, + "loss": 0.53, + "mean_token_accuracy": 0.8361918926239014, + "num_tokens": 489532754.0, + "step": 12834 + }, + { + "epoch": 1.6327439257091974, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8965433835983276, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8607200384140015, + "num_tokens": 489570378.0, + "step": 12835 + }, + { + "epoch": 1.6328711359877879, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.166046142578125, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8561317324638367, + "num_tokens": 489604562.0, + "step": 12836 + }, + { + "epoch": 1.6329983462663784, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9774783849716187, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8712139129638672, + "num_tokens": 489644060.0, + "step": 12837 + }, + { + "epoch": 1.633125556544969, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.861684799194336, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8650734424591064, + "num_tokens": 489682693.0, + "step": 12838 + }, + { + "epoch": 1.6332527668235595, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.987017273902893, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.873221755027771, + "num_tokens": 489720786.0, + "step": 12839 + }, + { + "epoch": 1.63337997710215, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7353127002716064, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.870297908782959, + "num_tokens": 489758773.0, + "step": 12840 + }, + { + "epoch": 1.6335071873807405, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8920797109603882, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8662572503089905, + "num_tokens": 489793436.0, + "step": 12841 + }, + { + "epoch": 1.6336343976593308, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9625260829925537, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8847050070762634, + "num_tokens": 489828634.0, + "step": 12842 + }, + { + "epoch": 1.6337616079379214, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9701544046401978, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8688240647315979, + "num_tokens": 489866335.0, + "step": 12843 + }, + { + "epoch": 1.6338888182165119, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.814895749092102, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8732629418373108, + "num_tokens": 489903059.0, + "step": 12844 + }, + { + "epoch": 1.6340160284951024, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8284369707107544, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.871493399143219, + "num_tokens": 489940186.0, + "step": 12845 + }, + { + "epoch": 1.634143238773693, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7600481510162354, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.876166582107544, + "num_tokens": 489984732.0, + "step": 12846 + }, + { + "epoch": 1.6342704490522835, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.947245478630066, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8664733171463013, + "num_tokens": 490016699.0, + "step": 12847 + }, + { + "epoch": 1.6343976593308738, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8762495517730713, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8584423065185547, + "num_tokens": 490051351.0, + "step": 12848 + }, + { + "epoch": 1.6345248696094643, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8447420597076416, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8737662434577942, + "num_tokens": 490089668.0, + "step": 12849 + }, + { + "epoch": 1.6346520798880548, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8173907995224, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8657598495483398, + "num_tokens": 490129651.0, + "step": 12850 + }, + { + "epoch": 1.6347792901666454, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8036006689071655, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.857421338558197, + "num_tokens": 490168441.0, + "step": 12851 + }, + { + "epoch": 1.6349065004452359, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8155391216278076, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8741523623466492, + "num_tokens": 490204754.0, + "step": 12852 + }, + { + "epoch": 1.6350337107238264, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8713308572769165, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.879665732383728, + "num_tokens": 490244178.0, + "step": 12853 + }, + { + "epoch": 1.635160921002417, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8830647468566895, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8609778881072998, + "num_tokens": 490281161.0, + "step": 12854 + }, + { + "epoch": 1.6352881312810075, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.011474370956421, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8649362325668335, + "num_tokens": 490314892.0, + "step": 12855 + }, + { + "epoch": 1.635415341559598, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9808560609817505, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8536362648010254, + "num_tokens": 490352600.0, + "step": 12856 + }, + { + "epoch": 1.6355425518381885, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.4269087314605713, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8760510683059692, + "num_tokens": 490384487.0, + "step": 12857 + }, + { + "epoch": 1.635669762116779, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.177488088607788, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8449209928512573, + "num_tokens": 490422199.0, + "step": 12858 + }, + { + "epoch": 1.6357969723953696, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0821444988250732, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8572337627410889, + "num_tokens": 490458058.0, + "step": 12859 + }, + { + "epoch": 1.6359241826739601, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8032798767089844, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8771268129348755, + "num_tokens": 490498447.0, + "step": 12860 + }, + { + "epoch": 1.6360513929525506, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8536732196807861, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8520384430885315, + "num_tokens": 490538502.0, + "step": 12861 + }, + { + "epoch": 1.6361786032311412, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.789690375328064, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8694402575492859, + "num_tokens": 490575752.0, + "step": 12862 + }, + { + "epoch": 1.6363058135097317, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.027859687805176, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8652905821800232, + "num_tokens": 490613502.0, + "step": 12863 + }, + { + "epoch": 1.6364330237883222, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.4100167751312256, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8590016961097717, + "num_tokens": 490653820.0, + "step": 12864 + }, + { + "epoch": 1.6365602340669128, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.012300968170166, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8501432538032532, + "num_tokens": 490693332.0, + "step": 12865 + }, + { + "epoch": 1.636687444345503, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9984925985336304, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.862807035446167, + "num_tokens": 490733135.0, + "step": 12866 + }, + { + "epoch": 1.6368146546240936, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9121731519699097, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8686567544937134, + "num_tokens": 490775505.0, + "step": 12867 + }, + { + "epoch": 1.6369418649026841, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8218421936035156, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.864896297454834, + "num_tokens": 490817783.0, + "step": 12868 + }, + { + "epoch": 1.6370690751812746, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0789170265197754, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8588457107543945, + "num_tokens": 490849464.0, + "step": 12869 + }, + { + "epoch": 1.6371962854598652, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8998137712478638, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8581470251083374, + "num_tokens": 490891925.0, + "step": 12870 + }, + { + "epoch": 1.6373234957384557, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0877206325531006, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8573329448699951, + "num_tokens": 490926221.0, + "step": 12871 + }, + { + "epoch": 1.6374507060170462, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9504481554031372, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8565551042556763, + "num_tokens": 490964141.0, + "step": 12872 + }, + { + "epoch": 1.6375779162956365, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.04896879196167, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8589315414428711, + "num_tokens": 490997917.0, + "step": 12873 + }, + { + "epoch": 1.637705126574227, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0136213302612305, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8804048299789429, + "num_tokens": 491029244.0, + "step": 12874 + }, + { + "epoch": 1.6378323368528176, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9293169975280762, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8639160394668579, + "num_tokens": 491066591.0, + "step": 12875 + }, + { + "epoch": 1.6379595471314081, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8842576742172241, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.863762378692627, + "num_tokens": 491106546.0, + "step": 12876 + }, + { + "epoch": 1.6380867574099987, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9575755596160889, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8573936223983765, + "num_tokens": 491145512.0, + "step": 12877 + }, + { + "epoch": 1.6382139676885892, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7889273166656494, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8633269667625427, + "num_tokens": 491185624.0, + "step": 12878 + }, + { + "epoch": 1.6383411779671797, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9031282663345337, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8589066863059998, + "num_tokens": 491220982.0, + "step": 12879 + }, + { + "epoch": 1.6384683882457702, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8416459560394287, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8655306696891785, + "num_tokens": 491262501.0, + "step": 12880 + }, + { + "epoch": 1.6385955985243608, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.751110553741455, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8754928708076477, + "num_tokens": 491298078.0, + "step": 12881 + }, + { + "epoch": 1.6387228088029513, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9493416547775269, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8704394102096558, + "num_tokens": 491336203.0, + "step": 12882 + }, + { + "epoch": 1.6388500190815418, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8733407258987427, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.853014349937439, + "num_tokens": 491374723.0, + "step": 12883 + }, + { + "epoch": 1.6389772293601323, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9496581554412842, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8665807843208313, + "num_tokens": 491410546.0, + "step": 12884 + }, + { + "epoch": 1.6391044396387229, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9804949760437012, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.861229419708252, + "num_tokens": 491450642.0, + "step": 12885 + }, + { + "epoch": 1.6392316499173134, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.881190538406372, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8609487414360046, + "num_tokens": 491489893.0, + "step": 12886 + }, + { + "epoch": 1.639358860195904, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.903401494026184, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8527020215988159, + "num_tokens": 491535870.0, + "step": 12887 + }, + { + "epoch": 1.6394860704744945, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9015913009643555, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8630975484848022, + "num_tokens": 491575988.0, + "step": 12888 + }, + { + "epoch": 1.639613280753085, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.151308536529541, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8670336604118347, + "num_tokens": 491616822.0, + "step": 12889 + }, + { + "epoch": 1.6397404910316755, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.909691333770752, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8836106657981873, + "num_tokens": 491654132.0, + "step": 12890 + }, + { + "epoch": 1.6398677013102658, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9301782846450806, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8519930839538574, + "num_tokens": 491697215.0, + "step": 12891 + }, + { + "epoch": 1.6399949115888564, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.195974826812744, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.84260094165802, + "num_tokens": 491731338.0, + "step": 12892 + }, + { + "epoch": 1.6401221218674469, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8803354501724243, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8626563549041748, + "num_tokens": 491770808.0, + "step": 12893 + }, + { + "epoch": 1.6402493321460374, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0113766193389893, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8717311024665833, + "num_tokens": 491804167.0, + "step": 12894 + }, + { + "epoch": 1.640376542424628, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0470428466796875, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8659131526947021, + "num_tokens": 491840094.0, + "step": 12895 + }, + { + "epoch": 1.6405037527032185, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.01468825340271, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8503296375274658, + "num_tokens": 491881643.0, + "step": 12896 + }, + { + "epoch": 1.6406309629818088, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8300894498825073, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8530305027961731, + "num_tokens": 491920951.0, + "step": 12897 + }, + { + "epoch": 1.6407581732603993, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9231613874435425, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8594356179237366, + "num_tokens": 491954599.0, + "step": 12898 + }, + { + "epoch": 1.6408853835389898, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8828564882278442, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.86562180519104, + "num_tokens": 491995375.0, + "step": 12899 + }, + { + "epoch": 1.6410125938175804, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7863848209381104, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8592915534973145, + "num_tokens": 492035297.0, + "step": 12900 + }, + { + "epoch": 1.6411398040961709, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.96638822555542, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.863883376121521, + "num_tokens": 492071003.0, + "step": 12901 + }, + { + "epoch": 1.6412670143747614, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.892646074295044, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.863325834274292, + "num_tokens": 492106046.0, + "step": 12902 + }, + { + "epoch": 1.641394224653352, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.84763503074646, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8767037987709045, + "num_tokens": 492147252.0, + "step": 12903 + }, + { + "epoch": 1.6415214349319425, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7922908067703247, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8542941808700562, + "num_tokens": 492188185.0, + "step": 12904 + }, + { + "epoch": 1.641648645210533, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9753150939941406, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.869002103805542, + "num_tokens": 492219630.0, + "step": 12905 + }, + { + "epoch": 1.6417758554891235, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0373692512512207, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8656805753707886, + "num_tokens": 492251955.0, + "step": 12906 + }, + { + "epoch": 1.641903065767714, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7671152353286743, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8681771755218506, + "num_tokens": 492293484.0, + "step": 12907 + }, + { + "epoch": 1.6420302760463046, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.078826427459717, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8500847816467285, + "num_tokens": 492331169.0, + "step": 12908 + }, + { + "epoch": 1.642157486324895, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8984413146972656, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8716683983802795, + "num_tokens": 492366071.0, + "step": 12909 + }, + { + "epoch": 1.6422846966034856, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1661953926086426, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8844881653785706, + "num_tokens": 492396820.0, + "step": 12910 + }, + { + "epoch": 1.6424119068820762, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9999299049377441, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8719065189361572, + "num_tokens": 492435282.0, + "step": 12911 + }, + { + "epoch": 1.6425391171606667, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9171268939971924, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8729605078697205, + "num_tokens": 492470195.0, + "step": 12912 + }, + { + "epoch": 1.6426663274392572, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8548189401626587, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.85743248462677, + "num_tokens": 492514265.0, + "step": 12913 + }, + { + "epoch": 1.6427935377178478, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.900375485420227, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8661386966705322, + "num_tokens": 492550348.0, + "step": 12914 + }, + { + "epoch": 1.642920747996438, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7983638048171997, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8677588701248169, + "num_tokens": 492590648.0, + "step": 12915 + }, + { + "epoch": 1.6430479582750286, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.1224777698516846, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8662552237510681, + "num_tokens": 492629523.0, + "step": 12916 + }, + { + "epoch": 1.6431751685536191, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.3160910606384277, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8595224618911743, + "num_tokens": 492668392.0, + "step": 12917 + }, + { + "epoch": 1.6433023788322096, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.298070192337036, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8540970683097839, + "num_tokens": 492696073.0, + "step": 12918 + }, + { + "epoch": 1.6434295891108002, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.063072919845581, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.866062343120575, + "num_tokens": 492728590.0, + "step": 12919 + }, + { + "epoch": 1.6435567993893907, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8733652830123901, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8579839468002319, + "num_tokens": 492767290.0, + "step": 12920 + }, + { + "epoch": 1.6436840096679812, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8246721029281616, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8757972717285156, + "num_tokens": 492801019.0, + "step": 12921 + }, + { + "epoch": 1.6438112199465715, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8573110103607178, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8634609580039978, + "num_tokens": 492841220.0, + "step": 12922 + }, + { + "epoch": 1.643938430225162, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8382699489593506, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8534270524978638, + "num_tokens": 492885076.0, + "step": 12923 + }, + { + "epoch": 1.6440656405037526, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8458037376403809, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.859815239906311, + "num_tokens": 492918554.0, + "step": 12924 + }, + { + "epoch": 1.6441928507823431, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7032541036605835, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8805712461471558, + "num_tokens": 492961890.0, + "step": 12925 + }, + { + "epoch": 1.6443200610609336, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8213151693344116, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.874668300151825, + "num_tokens": 492997768.0, + "step": 12926 + }, + { + "epoch": 1.6444472713395242, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6772236824035645, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8889983892440796, + "num_tokens": 493037017.0, + "step": 12927 + }, + { + "epoch": 1.6445744816181147, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.750624418258667, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8740507364273071, + "num_tokens": 493074579.0, + "step": 12928 + }, + { + "epoch": 1.6447016918967052, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8251451253890991, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8665918707847595, + "num_tokens": 493111249.0, + "step": 12929 + }, + { + "epoch": 1.6448289021752958, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.157423496246338, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8485320210456848, + "num_tokens": 493150723.0, + "step": 12930 + }, + { + "epoch": 1.6449561124538863, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8972270488739014, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8614677786827087, + "num_tokens": 493189530.0, + "step": 12931 + }, + { + "epoch": 1.6450833227324768, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8292059898376465, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8714309930801392, + "num_tokens": 493231369.0, + "step": 12932 + }, + { + "epoch": 1.6452105330110673, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9050348997116089, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.850205659866333, + "num_tokens": 493278430.0, + "step": 12933 + }, + { + "epoch": 1.6453377432896579, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9833195209503174, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8701727390289307, + "num_tokens": 493312258.0, + "step": 12934 + }, + { + "epoch": 1.6454649535682484, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0008411407470703, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8630915880203247, + "num_tokens": 493347725.0, + "step": 12935 + }, + { + "epoch": 1.645592163846839, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.833460807800293, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8678098320960999, + "num_tokens": 493388119.0, + "step": 12936 + }, + { + "epoch": 1.6457193741254295, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1323533058166504, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8666708469390869, + "num_tokens": 493424607.0, + "step": 12937 + }, + { + "epoch": 1.64584658440402, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.016052007675171, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8421475887298584, + "num_tokens": 493459546.0, + "step": 12938 + }, + { + "epoch": 1.6459737946826105, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9181058406829834, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8682371377944946, + "num_tokens": 493494162.0, + "step": 12939 + }, + { + "epoch": 1.6461010049612008, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 16.60491371154785, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8625463843345642, + "num_tokens": 493537523.0, + "step": 12940 + }, + { + "epoch": 1.6462282152397913, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.207489252090454, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8635334372520447, + "num_tokens": 493580065.0, + "step": 12941 + }, + { + "epoch": 1.6463554255183819, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.903071403503418, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8714317679405212, + "num_tokens": 493626717.0, + "step": 12942 + }, + { + "epoch": 1.6464826357969724, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.416609287261963, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8709027767181396, + "num_tokens": 493666266.0, + "step": 12943 + }, + { + "epoch": 1.646609846075563, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.9384406805038452, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8540942072868347, + "num_tokens": 493703184.0, + "step": 12944 + }, + { + "epoch": 1.6467370563541535, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7092043161392212, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8800094723701477, + "num_tokens": 493744061.0, + "step": 12945 + }, + { + "epoch": 1.6468642666327438, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8800652027130127, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8645302057266235, + "num_tokens": 493783585.0, + "step": 12946 + }, + { + "epoch": 1.6469914769113343, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 80.51919555664062, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8697277307510376, + "num_tokens": 493827781.0, + "step": 12947 + }, + { + "epoch": 1.6471186871899248, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.082294225692749, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8688390851020813, + "num_tokens": 493864418.0, + "step": 12948 + }, + { + "epoch": 1.6472458974685154, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.166867971420288, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8631024360656738, + "num_tokens": 493895226.0, + "step": 12949 + }, + { + "epoch": 1.6473731077471059, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.054140090942383, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8555641174316406, + "num_tokens": 493935210.0, + "step": 12950 + }, + { + "epoch": 1.6475003180256964, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.977197527885437, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8645051717758179, + "num_tokens": 493966368.0, + "step": 12951 + }, + { + "epoch": 1.647627528304287, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8805341720581055, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8571791648864746, + "num_tokens": 494004304.0, + "step": 12952 + }, + { + "epoch": 1.6477547385828775, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.828941822052002, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8725559711456299, + "num_tokens": 494044758.0, + "step": 12953 + }, + { + "epoch": 1.647881948861468, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8746026754379272, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8474940061569214, + "num_tokens": 494080624.0, + "step": 12954 + }, + { + "epoch": 1.6480091591400585, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8370885848999023, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8691114783287048, + "num_tokens": 494116215.0, + "step": 12955 + }, + { + "epoch": 1.648136369418649, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9277172088623047, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8629327416419983, + "num_tokens": 494150317.0, + "step": 12956 + }, + { + "epoch": 1.6482635796972396, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8769755363464355, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8780835866928101, + "num_tokens": 494187273.0, + "step": 12957 + }, + { + "epoch": 1.64839078997583, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8676453828811646, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8618032932281494, + "num_tokens": 494226921.0, + "step": 12958 + }, + { + "epoch": 1.6485180002544206, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0610392093658447, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.870196521282196, + "num_tokens": 494263141.0, + "step": 12959 + }, + { + "epoch": 1.6486452105330112, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7898229360580444, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8600693345069885, + "num_tokens": 494307925.0, + "step": 12960 + }, + { + "epoch": 1.6487724208116017, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7630562782287598, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8581165075302124, + "num_tokens": 494353428.0, + "step": 12961 + }, + { + "epoch": 1.6488996310901922, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.034882068634033, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8521124124526978, + "num_tokens": 494383559.0, + "step": 12962 + }, + { + "epoch": 1.6490268413687827, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.978271245956421, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8654106259346008, + "num_tokens": 494419564.0, + "step": 12963 + }, + { + "epoch": 1.649154051647373, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8264636993408203, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8674184083938599, + "num_tokens": 494466886.0, + "step": 12964 + }, + { + "epoch": 1.6492812619259636, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9708855152130127, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8464557528495789, + "num_tokens": 494506672.0, + "step": 12965 + }, + { + "epoch": 1.649408472204554, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0497565269470215, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8650656938552856, + "num_tokens": 494541881.0, + "step": 12966 + }, + { + "epoch": 1.6495356824831446, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.777449131011963, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8697319030761719, + "num_tokens": 494583546.0, + "step": 12967 + }, + { + "epoch": 1.6496628927617352, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0853283405303955, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8523869514465332, + "num_tokens": 494616625.0, + "step": 12968 + }, + { + "epoch": 1.6497901030403257, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7973703145980835, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8784103393554688, + "num_tokens": 494654590.0, + "step": 12969 + }, + { + "epoch": 1.6499173133189162, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.773221731185913, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.878997266292572, + "num_tokens": 494692471.0, + "step": 12970 + }, + { + "epoch": 1.6500445235975065, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8448853492736816, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8566666841506958, + "num_tokens": 494730347.0, + "step": 12971 + }, + { + "epoch": 1.650171733876097, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.975062608718872, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8521673679351807, + "num_tokens": 494763753.0, + "step": 12972 + }, + { + "epoch": 1.6502989441546876, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9605532884597778, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8588853478431702, + "num_tokens": 494800416.0, + "step": 12973 + }, + { + "epoch": 1.6504261544332781, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8988641500473022, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8631192445755005, + "num_tokens": 494842733.0, + "step": 12974 + }, + { + "epoch": 1.6505533647118686, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0759124755859375, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8565639853477478, + "num_tokens": 494878661.0, + "step": 12975 + }, + { + "epoch": 1.6506805749904592, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8429826498031616, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8562028408050537, + "num_tokens": 494918142.0, + "step": 12976 + }, + { + "epoch": 1.6508077852690497, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9947338104248047, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8629868030548096, + "num_tokens": 494956947.0, + "step": 12977 + }, + { + "epoch": 1.6509349955476402, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9697439670562744, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8584233522415161, + "num_tokens": 494989850.0, + "step": 12978 + }, + { + "epoch": 1.6510622058262308, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.998012900352478, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8561438918113708, + "num_tokens": 495026770.0, + "step": 12979 + }, + { + "epoch": 1.6511894161048213, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8359096050262451, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8765077590942383, + "num_tokens": 495065009.0, + "step": 12980 + }, + { + "epoch": 1.6513166263834118, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9540812969207764, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8645985126495361, + "num_tokens": 495096914.0, + "step": 12981 + }, + { + "epoch": 1.6514438366620023, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1462032794952393, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8687244653701782, + "num_tokens": 495138348.0, + "step": 12982 + }, + { + "epoch": 1.6515710469405929, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8307969570159912, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8584932088851929, + "num_tokens": 495177026.0, + "step": 12983 + }, + { + "epoch": 1.6516982572191834, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9720008373260498, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8656577467918396, + "num_tokens": 495213850.0, + "step": 12984 + }, + { + "epoch": 1.651825467497774, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 16.596675872802734, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8728018403053284, + "num_tokens": 495257213.0, + "step": 12985 + }, + { + "epoch": 1.6519526777763645, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9754812717437744, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8550338745117188, + "num_tokens": 495300018.0, + "step": 12986 + }, + { + "epoch": 1.652079888054955, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.9723529815673828, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8644586205482483, + "num_tokens": 495336471.0, + "step": 12987 + }, + { + "epoch": 1.6522070983335455, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9234116077423096, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8678354024887085, + "num_tokens": 495374680.0, + "step": 12988 + }, + { + "epoch": 1.6523343086121358, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7832906246185303, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8677363395690918, + "num_tokens": 495414096.0, + "step": 12989 + }, + { + "epoch": 1.6524615188907263, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8215545415878296, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.852908194065094, + "num_tokens": 495452955.0, + "step": 12990 + }, + { + "epoch": 1.6525887291693169, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8668367862701416, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8614972829818726, + "num_tokens": 495490091.0, + "step": 12991 + }, + { + "epoch": 1.6527159394479074, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9309585094451904, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8716980218887329, + "num_tokens": 495524543.0, + "step": 12992 + }, + { + "epoch": 1.652843149726498, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.957818627357483, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8553797006607056, + "num_tokens": 495562068.0, + "step": 12993 + }, + { + "epoch": 1.6529703600050885, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0826075077056885, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8730416297912598, + "num_tokens": 495598355.0, + "step": 12994 + }, + { + "epoch": 1.6530975702836788, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8865829706192017, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8496593832969666, + "num_tokens": 495634735.0, + "step": 12995 + }, + { + "epoch": 1.6532247805622693, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1882684230804443, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8642837405204773, + "num_tokens": 495668804.0, + "step": 12996 + }, + { + "epoch": 1.6533519908408598, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.941579818725586, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8679473400115967, + "num_tokens": 495707129.0, + "step": 12997 + }, + { + "epoch": 1.6534792011194503, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9738446474075317, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8440808057785034, + "num_tokens": 495746501.0, + "step": 12998 + }, + { + "epoch": 1.6536064113980409, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8409279584884644, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8798975944519043, + "num_tokens": 495789723.0, + "step": 12999 + }, + { + "epoch": 1.6537336216766314, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9961236715316772, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8555052876472473, + "num_tokens": 495827073.0, + "step": 13000 + }, + { + "epoch": 1.653860831955222, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9631112813949585, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8538093566894531, + "num_tokens": 495862362.0, + "step": 13001 + }, + { + "epoch": 1.6539880422338125, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8199933767318726, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8667959570884705, + "num_tokens": 495905546.0, + "step": 13002 + }, + { + "epoch": 1.654115252512403, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.103842258453369, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8617342710494995, + "num_tokens": 495942291.0, + "step": 13003 + }, + { + "epoch": 1.6542424627909935, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9189445972442627, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8542938828468323, + "num_tokens": 495978631.0, + "step": 13004 + }, + { + "epoch": 1.654369673069584, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.914758324623108, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8583875894546509, + "num_tokens": 496016626.0, + "step": 13005 + }, + { + "epoch": 1.6544968833481746, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9555519819259644, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8557971119880676, + "num_tokens": 496051949.0, + "step": 13006 + }, + { + "epoch": 1.654624093626765, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8207111358642578, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.870192289352417, + "num_tokens": 496092693.0, + "step": 13007 + }, + { + "epoch": 1.6547513039053556, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8577923774719238, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.858534574508667, + "num_tokens": 496132853.0, + "step": 13008 + }, + { + "epoch": 1.6548785141839462, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9718070030212402, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8495568037033081, + "num_tokens": 496168303.0, + "step": 13009 + }, + { + "epoch": 1.6550057244625367, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8795632123947144, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8512012958526611, + "num_tokens": 496206132.0, + "step": 13010 + }, + { + "epoch": 1.6551329347411272, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.902851939201355, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.857724666595459, + "num_tokens": 496247542.0, + "step": 13011 + }, + { + "epoch": 1.6552601450197177, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.726128101348877, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8753458261489868, + "num_tokens": 496285698.0, + "step": 13012 + }, + { + "epoch": 1.655387355298308, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9782333374023438, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8652310967445374, + "num_tokens": 496322216.0, + "step": 13013 + }, + { + "epoch": 1.6555145655768986, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.050762414932251, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8604183793067932, + "num_tokens": 496357687.0, + "step": 13014 + }, + { + "epoch": 1.655641775855489, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8674697875976562, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8757141828536987, + "num_tokens": 496400875.0, + "step": 13015 + }, + { + "epoch": 1.6557689861340796, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0237855911254883, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8662244081497192, + "num_tokens": 496432076.0, + "step": 13016 + }, + { + "epoch": 1.6558961964126702, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.062699556350708, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.861284077167511, + "num_tokens": 496466852.0, + "step": 13017 + }, + { + "epoch": 1.6560234066912607, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8121596574783325, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.862647294998169, + "num_tokens": 496508890.0, + "step": 13018 + }, + { + "epoch": 1.6561506169698512, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.990600347518921, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8790467381477356, + "num_tokens": 496537964.0, + "step": 13019 + }, + { + "epoch": 1.6562778272484415, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2245664596557617, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8462398052215576, + "num_tokens": 496571263.0, + "step": 13020 + }, + { + "epoch": 1.656405037527032, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.159663677215576, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8712829351425171, + "num_tokens": 496610058.0, + "step": 13021 + }, + { + "epoch": 1.6565322478056226, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.008193254470825, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8577530980110168, + "num_tokens": 496640542.0, + "step": 13022 + }, + { + "epoch": 1.656659458084213, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.0306413173675537, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8608160614967346, + "num_tokens": 496678030.0, + "step": 13023 + }, + { + "epoch": 1.6567866683628036, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.9522725343704224, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8604819178581238, + "num_tokens": 496714057.0, + "step": 13024 + }, + { + "epoch": 1.6569138786413942, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.9673106670379639, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8798314332962036, + "num_tokens": 496744501.0, + "step": 13025 + }, + { + "epoch": 1.6570410889199847, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.000185251235962, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8689008355140686, + "num_tokens": 496780233.0, + "step": 13026 + }, + { + "epoch": 1.6571682991985752, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.999367594718933, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8645743131637573, + "num_tokens": 496809645.0, + "step": 13027 + }, + { + "epoch": 1.6572955094771658, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.8421711921691895, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8642088770866394, + "num_tokens": 496854871.0, + "step": 13028 + }, + { + "epoch": 1.6574227197557563, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8011751174926758, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8807656764984131, + "num_tokens": 496891958.0, + "step": 13029 + }, + { + "epoch": 1.6575499300343468, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.8760405778884888, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8560940623283386, + "num_tokens": 496928550.0, + "step": 13030 + }, + { + "epoch": 1.6576771403129373, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.8955094814300537, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8629366159439087, + "num_tokens": 496972009.0, + "step": 13031 + }, + { + "epoch": 1.6578043505915279, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 7.762563705444336, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8622186779975891, + "num_tokens": 497013605.0, + "step": 13032 + }, + { + "epoch": 1.6579315608701184, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.016979932785034, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8665575385093689, + "num_tokens": 497048712.0, + "step": 13033 + }, + { + "epoch": 1.658058771148709, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.8561269044876099, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8784699440002441, + "num_tokens": 497089161.0, + "step": 13034 + }, + { + "epoch": 1.6581859814272994, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.9798800945281982, + "learning_rate": 1e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8351494073867798, + "num_tokens": 497127253.0, + "step": 13035 + }, + { + "epoch": 1.65831319170589, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.6993221044540405, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.859600305557251, + "num_tokens": 497167637.0, + "step": 13036 + }, + { + "epoch": 1.6584404019844805, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9087105989456177, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8643388152122498, + "num_tokens": 497205663.0, + "step": 13037 + }, + { + "epoch": 1.6585676122630708, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8711878061294556, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8801921606063843, + "num_tokens": 497242146.0, + "step": 13038 + }, + { + "epoch": 1.6586948225416613, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7915157079696655, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8697474598884583, + "num_tokens": 497279339.0, + "step": 13039 + }, + { + "epoch": 1.6588220328202519, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.87663733959198, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8550351858139038, + "num_tokens": 497318530.0, + "step": 13040 + }, + { + "epoch": 1.6589492430988424, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9695223569869995, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8562567234039307, + "num_tokens": 497352943.0, + "step": 13041 + }, + { + "epoch": 1.659076453377433, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0292816162109375, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8590683341026306, + "num_tokens": 497387729.0, + "step": 13042 + }, + { + "epoch": 1.6592036636560235, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 16.600309371948242, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8644425868988037, + "num_tokens": 497427875.0, + "step": 13043 + }, + { + "epoch": 1.6593308739346138, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.0396668910980225, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8559731841087341, + "num_tokens": 497469076.0, + "step": 13044 + }, + { + "epoch": 1.6594580842132043, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.878418207168579, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8655170202255249, + "num_tokens": 497509556.0, + "step": 13045 + }, + { + "epoch": 1.6595852944917948, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.0334742069244385, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8558545112609863, + "num_tokens": 497548151.0, + "step": 13046 + }, + { + "epoch": 1.6597125047703853, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 7.795001029968262, + "learning_rate": 1e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8383641242980957, + "num_tokens": 497589088.0, + "step": 13047 + }, + { + "epoch": 1.6598397150489759, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.7902307510375977, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8734211921691895, + "num_tokens": 497632095.0, + "step": 13048 + }, + { + "epoch": 1.6599669253275664, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.0276854038238525, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8755028247833252, + "num_tokens": 497665148.0, + "step": 13049 + }, + { + "epoch": 1.660094135606157, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.0714433193206787, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.857559859752655, + "num_tokens": 497700134.0, + "step": 13050 + }, + { + "epoch": 1.6602213458847475, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.7112230062484741, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8677233457565308, + "num_tokens": 497743101.0, + "step": 13051 + }, + { + "epoch": 1.660348556163338, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9102779626846313, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8549451231956482, + "num_tokens": 497775112.0, + "step": 13052 + }, + { + "epoch": 1.6604757664419285, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8365339040756226, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.874503493309021, + "num_tokens": 497813320.0, + "step": 13053 + }, + { + "epoch": 1.660602976720519, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7594609260559082, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8583780527114868, + "num_tokens": 497852780.0, + "step": 13054 + }, + { + "epoch": 1.6607301869991096, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9498268365859985, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8661859631538391, + "num_tokens": 497886313.0, + "step": 13055 + }, + { + "epoch": 1.6608573972777, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8737246990203857, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8616467714309692, + "num_tokens": 497924108.0, + "step": 13056 + }, + { + "epoch": 1.6609846075562906, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9118356704711914, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8538689613342285, + "num_tokens": 497964656.0, + "step": 13057 + }, + { + "epoch": 1.6611118178348812, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9310141801834106, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8503680229187012, + "num_tokens": 498005436.0, + "step": 13058 + }, + { + "epoch": 1.6612390281134717, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8187693357467651, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8601332902908325, + "num_tokens": 498042237.0, + "step": 13059 + }, + { + "epoch": 1.6613662383920622, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8300135135650635, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8628191947937012, + "num_tokens": 498083008.0, + "step": 13060 + }, + { + "epoch": 1.6614934486706527, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.3968076705932617, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8410404920578003, + "num_tokens": 498120550.0, + "step": 13061 + }, + { + "epoch": 1.661620658949243, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0035905838012695, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8597057461738586, + "num_tokens": 498153033.0, + "step": 13062 + }, + { + "epoch": 1.6617478692278336, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9334136247634888, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8653781414031982, + "num_tokens": 498190491.0, + "step": 13063 + }, + { + "epoch": 1.661875079506424, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7994853258132935, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8560967445373535, + "num_tokens": 498235889.0, + "step": 13064 + }, + { + "epoch": 1.6620022897850146, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7855989933013916, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.861005425453186, + "num_tokens": 498277191.0, + "step": 13065 + }, + { + "epoch": 1.6621295000636052, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.885716438293457, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8519361019134521, + "num_tokens": 498318443.0, + "step": 13066 + }, + { + "epoch": 1.6622567103421957, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1661345958709717, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8511794805526733, + "num_tokens": 498355955.0, + "step": 13067 + }, + { + "epoch": 1.662383920620786, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.028566837310791, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8644012212753296, + "num_tokens": 498388588.0, + "step": 13068 + }, + { + "epoch": 1.6625111308993765, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.021808385848999, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8612911105155945, + "num_tokens": 498430661.0, + "step": 13069 + }, + { + "epoch": 1.662638341177967, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.829101800918579, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.852016270160675, + "num_tokens": 498470428.0, + "step": 13070 + }, + { + "epoch": 1.6627655514565576, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7074835300445557, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8636552095413208, + "num_tokens": 498512031.0, + "step": 13071 + }, + { + "epoch": 1.662892761735148, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9146586656570435, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8859320282936096, + "num_tokens": 498548344.0, + "step": 13072 + }, + { + "epoch": 1.6630199720137386, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9154713153839111, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8765814900398254, + "num_tokens": 498590115.0, + "step": 13073 + }, + { + "epoch": 1.6631471822923292, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.86436927318573, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8638944625854492, + "num_tokens": 498634034.0, + "step": 13074 + }, + { + "epoch": 1.6632743925709197, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9509658813476562, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8544352650642395, + "num_tokens": 498675086.0, + "step": 13075 + }, + { + "epoch": 1.6634016028495102, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7956594228744507, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.871082067489624, + "num_tokens": 498711374.0, + "step": 13076 + }, + { + "epoch": 1.6635288131281007, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.322002649307251, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8698019981384277, + "num_tokens": 498753050.0, + "step": 13077 + }, + { + "epoch": 1.6636560234066913, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9518831968307495, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8645597100257874, + "num_tokens": 498790406.0, + "step": 13078 + }, + { + "epoch": 1.6637832336852818, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1798057556152344, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8617671132087708, + "num_tokens": 498827712.0, + "step": 13079 + }, + { + "epoch": 1.6639104439638723, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9593513011932373, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8681323528289795, + "num_tokens": 498862283.0, + "step": 13080 + }, + { + "epoch": 1.6640376542424629, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7754939794540405, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.864901602268219, + "num_tokens": 498902686.0, + "step": 13081 + }, + { + "epoch": 1.6641648645210534, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.936015248298645, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8519885540008545, + "num_tokens": 498942785.0, + "step": 13082 + }, + { + "epoch": 1.664292074799644, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8218848705291748, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.869292140007019, + "num_tokens": 498982761.0, + "step": 13083 + }, + { + "epoch": 1.6644192850782344, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9953540563583374, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8453197479248047, + "num_tokens": 499025762.0, + "step": 13084 + }, + { + "epoch": 1.664546495356825, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9280375242233276, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8548444509506226, + "num_tokens": 499063394.0, + "step": 13085 + }, + { + "epoch": 1.6646737056354155, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7658518552780151, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8742793798446655, + "num_tokens": 499105824.0, + "step": 13086 + }, + { + "epoch": 1.6648009159140058, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9021062850952148, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8709627389907837, + "num_tokens": 499140935.0, + "step": 13087 + }, + { + "epoch": 1.6649281261925963, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9057776927947998, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8644416928291321, + "num_tokens": 499179492.0, + "step": 13088 + }, + { + "epoch": 1.6650553364711869, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8685089349746704, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8744279146194458, + "num_tokens": 499214716.0, + "step": 13089 + }, + { + "epoch": 1.6651825467497774, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8111895322799683, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8704890012741089, + "num_tokens": 499250337.0, + "step": 13090 + }, + { + "epoch": 1.665309757028368, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0180208683013916, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8677413463592529, + "num_tokens": 499283675.0, + "step": 13091 + }, + { + "epoch": 1.6654369673069584, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9023834466934204, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8677540421485901, + "num_tokens": 499319153.0, + "step": 13092 + }, + { + "epoch": 1.6655641775855488, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8362294435501099, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8600964546203613, + "num_tokens": 499356675.0, + "step": 13093 + }, + { + "epoch": 1.6656913878641393, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8526273965835571, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8786858916282654, + "num_tokens": 499392129.0, + "step": 13094 + }, + { + "epoch": 1.6658185981427298, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9216663837432861, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8745852708816528, + "num_tokens": 499424958.0, + "step": 13095 + }, + { + "epoch": 1.6659458084213203, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8327264785766602, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.872199535369873, + "num_tokens": 499467769.0, + "step": 13096 + }, + { + "epoch": 1.6660730186999109, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.924518346786499, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8535233736038208, + "num_tokens": 499513828.0, + "step": 13097 + }, + { + "epoch": 1.6662002289785014, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0334460735321045, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8690902590751648, + "num_tokens": 499548219.0, + "step": 13098 + }, + { + "epoch": 1.666327439257092, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9385879039764404, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8476846218109131, + "num_tokens": 499587158.0, + "step": 13099 + }, + { + "epoch": 1.6664546495356825, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.918133020401001, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8663886785507202, + "num_tokens": 499625235.0, + "step": 13100 + }, + { + "epoch": 1.666581859814273, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.948971152305603, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.844618558883667, + "num_tokens": 499662842.0, + "step": 13101 + }, + { + "epoch": 1.6667090700928635, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8055338859558105, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8412666320800781, + "num_tokens": 499703464.0, + "step": 13102 + }, + { + "epoch": 1.666836280371454, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9337644577026367, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8627109527587891, + "num_tokens": 499739160.0, + "step": 13103 + }, + { + "epoch": 1.6669634906500446, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9006394147872925, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8509304523468018, + "num_tokens": 499775024.0, + "step": 13104 + }, + { + "epoch": 1.667090700928635, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8994165658950806, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8662350177764893, + "num_tokens": 499816479.0, + "step": 13105 + }, + { + "epoch": 1.6672179112072256, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9337272644042969, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8659082651138306, + "num_tokens": 499848790.0, + "step": 13106 + }, + { + "epoch": 1.6673451214858162, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.7331957817077637, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.882501482963562, + "num_tokens": 499890955.0, + "step": 13107 + }, + { + "epoch": 1.6674723317644067, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.952376365661621, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.872241735458374, + "num_tokens": 499924813.0, + "step": 13108 + }, + { + "epoch": 1.6675995420429972, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.811929702758789, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8605947494506836, + "num_tokens": 499960032.0, + "step": 13109 + }, + { + "epoch": 1.6677267523215877, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.281205654144287, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8780689239501953, + "num_tokens": 499995868.0, + "step": 13110 + }, + { + "epoch": 1.667853962600178, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.887739658355713, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8682308197021484, + "num_tokens": 500031662.0, + "step": 13111 + }, + { + "epoch": 1.6679811728787686, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8103363513946533, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8549745678901672, + "num_tokens": 500070189.0, + "step": 13112 + }, + { + "epoch": 1.668108383157359, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.258234739303589, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8603966236114502, + "num_tokens": 500109999.0, + "step": 13113 + }, + { + "epoch": 1.6682355934359496, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9530867338180542, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8471373319625854, + "num_tokens": 500151473.0, + "step": 13114 + }, + { + "epoch": 1.6683628037145402, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9636938571929932, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8617227077484131, + "num_tokens": 500187436.0, + "step": 13115 + }, + { + "epoch": 1.6684900139931307, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0802981853485107, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8629006743431091, + "num_tokens": 500222433.0, + "step": 13116 + }, + { + "epoch": 1.668617224271721, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8454017639160156, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8654941320419312, + "num_tokens": 500257390.0, + "step": 13117 + }, + { + "epoch": 1.6687444345503115, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.890533208847046, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8637784123420715, + "num_tokens": 500301260.0, + "step": 13118 + }, + { + "epoch": 1.668871644828902, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8886942863464355, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8699935078620911, + "num_tokens": 500343453.0, + "step": 13119 + }, + { + "epoch": 1.6689988551074926, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9412866830825806, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8711405992507935, + "num_tokens": 500375133.0, + "step": 13120 + }, + { + "epoch": 1.669126065386083, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9472955465316772, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8590100407600403, + "num_tokens": 500408540.0, + "step": 13121 + }, + { + "epoch": 1.6692532756646736, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9142401218414307, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8763693571090698, + "num_tokens": 500446243.0, + "step": 13122 + }, + { + "epoch": 1.6693804859432642, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.510744571685791, + "learning_rate": 1e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.840256929397583, + "num_tokens": 500484313.0, + "step": 13123 + }, + { + "epoch": 1.6695076962218547, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8490400314331055, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8755121827125549, + "num_tokens": 500525566.0, + "step": 13124 + }, + { + "epoch": 1.6696349065004452, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8417913913726807, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8692534565925598, + "num_tokens": 500566274.0, + "step": 13125 + }, + { + "epoch": 1.6697621167790357, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.030592203140259, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8554054498672485, + "num_tokens": 500602769.0, + "step": 13126 + }, + { + "epoch": 1.6698893270576263, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.984610676765442, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8477368354797363, + "num_tokens": 500642895.0, + "step": 13127 + }, + { + "epoch": 1.6700165373362168, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.3041012287139893, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8603876829147339, + "num_tokens": 500681461.0, + "step": 13128 + }, + { + "epoch": 1.6701437476148073, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8944518566131592, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.848580002784729, + "num_tokens": 500723395.0, + "step": 13129 + }, + { + "epoch": 1.6702709578933979, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8851203918457031, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8654179573059082, + "num_tokens": 500760571.0, + "step": 13130 + }, + { + "epoch": 1.6703981681719884, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.021925210952759, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8544121980667114, + "num_tokens": 500797858.0, + "step": 13131 + }, + { + "epoch": 1.670525378450579, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.742706060409546, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8700767755508423, + "num_tokens": 500840199.0, + "step": 13132 + }, + { + "epoch": 1.6706525887291694, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9297257661819458, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8577282428741455, + "num_tokens": 500883290.0, + "step": 13133 + }, + { + "epoch": 1.67077979900776, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0178258419036865, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8670664429664612, + "num_tokens": 500923072.0, + "step": 13134 + }, + { + "epoch": 1.6709070092863505, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0332882404327393, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8670499920845032, + "num_tokens": 500958726.0, + "step": 13135 + }, + { + "epoch": 1.6710342195649408, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.010831832885742, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8600825071334839, + "num_tokens": 500996746.0, + "step": 13136 + }, + { + "epoch": 1.6711614298435313, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.978428840637207, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8553162813186646, + "num_tokens": 501034135.0, + "step": 13137 + }, + { + "epoch": 1.6712886401221219, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8965555429458618, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.865898847579956, + "num_tokens": 501069810.0, + "step": 13138 + }, + { + "epoch": 1.6714158504007124, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9010977745056152, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8573775291442871, + "num_tokens": 501110597.0, + "step": 13139 + }, + { + "epoch": 1.671543060679303, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.964487075805664, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8520898222923279, + "num_tokens": 501145927.0, + "step": 13140 + }, + { + "epoch": 1.6716702709578934, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2229127883911133, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8440628051757812, + "num_tokens": 501188473.0, + "step": 13141 + }, + { + "epoch": 1.6717974812364838, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8142706155776978, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.876721978187561, + "num_tokens": 501226143.0, + "step": 13142 + }, + { + "epoch": 1.6719246915150743, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8339797258377075, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8541601300239563, + "num_tokens": 501266980.0, + "step": 13143 + }, + { + "epoch": 1.6720519017936648, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.974892497062683, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8606134057044983, + "num_tokens": 501305414.0, + "step": 13144 + }, + { + "epoch": 1.6721791120722553, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8940227031707764, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8549649715423584, + "num_tokens": 501343009.0, + "step": 13145 + }, + { + "epoch": 1.6723063223508459, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7479292154312134, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8682037591934204, + "num_tokens": 501383571.0, + "step": 13146 + }, + { + "epoch": 1.6724335326294364, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7011233568191528, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8660137057304382, + "num_tokens": 501425444.0, + "step": 13147 + }, + { + "epoch": 1.672560742908027, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8559740781784058, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8759831190109253, + "num_tokens": 501466536.0, + "step": 13148 + }, + { + "epoch": 1.6726879531866174, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9477633237838745, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8661317825317383, + "num_tokens": 501507346.0, + "step": 13149 + }, + { + "epoch": 1.672815163465208, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9449198246002197, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8702322244644165, + "num_tokens": 501545515.0, + "step": 13150 + }, + { + "epoch": 1.6729423737437985, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.91291344165802, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8566941618919373, + "num_tokens": 501588793.0, + "step": 13151 + }, + { + "epoch": 1.673069584022389, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.946088194847107, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8627957701683044, + "num_tokens": 501624384.0, + "step": 13152 + }, + { + "epoch": 1.6731967943009796, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9719080924987793, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8640781044960022, + "num_tokens": 501663150.0, + "step": 13153 + }, + { + "epoch": 1.67332400457957, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9148573875427246, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8561997413635254, + "num_tokens": 501706177.0, + "step": 13154 + }, + { + "epoch": 1.6734512148581606, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8437988758087158, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.875001072883606, + "num_tokens": 501745267.0, + "step": 13155 + }, + { + "epoch": 1.6735784251367511, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7992757558822632, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8584887981414795, + "num_tokens": 501783591.0, + "step": 13156 + }, + { + "epoch": 1.6737056354153417, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8115068674087524, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8541550636291504, + "num_tokens": 501825190.0, + "step": 13157 + }, + { + "epoch": 1.6738328456939322, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1772968769073486, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8632450103759766, + "num_tokens": 501855813.0, + "step": 13158 + }, + { + "epoch": 1.6739600559725227, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.010152578353882, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8601575493812561, + "num_tokens": 501890106.0, + "step": 13159 + }, + { + "epoch": 1.674087266251113, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8702828884124756, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8493958115577698, + "num_tokens": 501935437.0, + "step": 13160 + }, + { + "epoch": 1.6742144765297036, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8094581365585327, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8513584136962891, + "num_tokens": 501980134.0, + "step": 13161 + }, + { + "epoch": 1.674341686808294, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.008024215698242, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8576319813728333, + "num_tokens": 502015977.0, + "step": 13162 + }, + { + "epoch": 1.6744688970868846, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7798954248428345, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8643616437911987, + "num_tokens": 502061717.0, + "step": 13163 + }, + { + "epoch": 1.6745961073654752, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8255479335784912, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8614992499351501, + "num_tokens": 502101594.0, + "step": 13164 + }, + { + "epoch": 1.6747233176440657, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8391809463500977, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8628711700439453, + "num_tokens": 502136521.0, + "step": 13165 + }, + { + "epoch": 1.674850527922656, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.102296829223633, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8434919118881226, + "num_tokens": 502174821.0, + "step": 13166 + }, + { + "epoch": 1.6749777382012465, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.101797103881836, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8567088842391968, + "num_tokens": 502210615.0, + "step": 13167 + }, + { + "epoch": 1.675104948479837, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8458061218261719, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8635738492012024, + "num_tokens": 502248220.0, + "step": 13168 + }, + { + "epoch": 1.6752321587584276, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8538676500320435, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8634514808654785, + "num_tokens": 502290686.0, + "step": 13169 + }, + { + "epoch": 1.675359369037018, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1220102310180664, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8513431549072266, + "num_tokens": 502331789.0, + "step": 13170 + }, + { + "epoch": 1.6754865793156086, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9064068794250488, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.869418203830719, + "num_tokens": 502371490.0, + "step": 13171 + }, + { + "epoch": 1.6756137895941992, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2030138969421387, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8617604970932007, + "num_tokens": 502415606.0, + "step": 13172 + }, + { + "epoch": 1.6757409998727897, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8170075416564941, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8627587556838989, + "num_tokens": 502456203.0, + "step": 13173 + }, + { + "epoch": 1.6758682101513802, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.833013892173767, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8789613842964172, + "num_tokens": 502491508.0, + "step": 13174 + }, + { + "epoch": 1.6759954204299707, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8974497318267822, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8518596291542053, + "num_tokens": 502533408.0, + "step": 13175 + }, + { + "epoch": 1.6761226307085613, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8838653564453125, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8553161025047302, + "num_tokens": 502569824.0, + "step": 13176 + }, + { + "epoch": 1.6762498409871518, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9187345504760742, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.871126651763916, + "num_tokens": 502604776.0, + "step": 13177 + }, + { + "epoch": 1.6763770512657423, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.274031162261963, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.845183789730072, + "num_tokens": 502645732.0, + "step": 13178 + }, + { + "epoch": 1.6765042615443329, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.957648515701294, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8659579753875732, + "num_tokens": 502686044.0, + "step": 13179 + }, + { + "epoch": 1.6766314718229234, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8553626537322998, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8537014126777649, + "num_tokens": 502728288.0, + "step": 13180 + }, + { + "epoch": 1.676758682101514, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9350242614746094, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8544243574142456, + "num_tokens": 502760772.0, + "step": 13181 + }, + { + "epoch": 1.6768858923801044, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9836969375610352, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8630180954933167, + "num_tokens": 502791585.0, + "step": 13182 + }, + { + "epoch": 1.677013102658695, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0840396881103516, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8798514008522034, + "num_tokens": 502826339.0, + "step": 13183 + }, + { + "epoch": 1.6771403129372855, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.708716630935669, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8803629875183105, + "num_tokens": 502868716.0, + "step": 13184 + }, + { + "epoch": 1.6772675232158758, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.920271396636963, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8630970120429993, + "num_tokens": 502907992.0, + "step": 13185 + }, + { + "epoch": 1.6773947334944663, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0246129035949707, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8568053245544434, + "num_tokens": 502947197.0, + "step": 13186 + }, + { + "epoch": 1.6775219437730569, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9365301132202148, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8720343112945557, + "num_tokens": 502985890.0, + "step": 13187 + }, + { + "epoch": 1.6776491540516474, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7084916830062866, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8566778898239136, + "num_tokens": 503033476.0, + "step": 13188 + }, + { + "epoch": 1.677776364330238, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.06988525390625, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8716070652008057, + "num_tokens": 503068002.0, + "step": 13189 + }, + { + "epoch": 1.6779035746088284, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.125047445297241, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8554694056510925, + "num_tokens": 503110059.0, + "step": 13190 + }, + { + "epoch": 1.6780307848874187, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.023468017578125, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.856336772441864, + "num_tokens": 503146617.0, + "step": 13191 + }, + { + "epoch": 1.6781579951660093, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8757933378219604, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8600180745124817, + "num_tokens": 503186436.0, + "step": 13192 + }, + { + "epoch": 1.6782852054445998, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.148975133895874, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8675384521484375, + "num_tokens": 503227983.0, + "step": 13193 + }, + { + "epoch": 1.6784124157231903, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.01865816116333, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8497328758239746, + "num_tokens": 503269301.0, + "step": 13194 + }, + { + "epoch": 1.6785396260017809, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8823573589324951, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.844215989112854, + "num_tokens": 503308919.0, + "step": 13195 + }, + { + "epoch": 1.6786668362803714, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9265141487121582, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8720942139625549, + "num_tokens": 503345030.0, + "step": 13196 + }, + { + "epoch": 1.678794046558962, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.037602663040161, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8545739054679871, + "num_tokens": 503379142.0, + "step": 13197 + }, + { + "epoch": 1.6789212568375524, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.381667137145996, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8634381890296936, + "num_tokens": 503411810.0, + "step": 13198 + }, + { + "epoch": 1.679048467116143, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0478146076202393, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.859541654586792, + "num_tokens": 503450266.0, + "step": 13199 + }, + { + "epoch": 1.6791756773947335, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8979098796844482, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8701969981193542, + "num_tokens": 503491015.0, + "step": 13200 + }, + { + "epoch": 1.679302887673324, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8288404941558838, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.870705246925354, + "num_tokens": 503531964.0, + "step": 13201 + }, + { + "epoch": 1.6794300979519146, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9420578479766846, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8657206892967224, + "num_tokens": 503576069.0, + "step": 13202 + }, + { + "epoch": 1.679557308230505, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0301144123077393, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8638177514076233, + "num_tokens": 503607895.0, + "step": 13203 + }, + { + "epoch": 1.6796845185090956, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.8085758686065674, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8569151163101196, + "num_tokens": 503636891.0, + "step": 13204 + }, + { + "epoch": 1.6798117287876861, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7061095237731934, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8773943185806274, + "num_tokens": 503682037.0, + "step": 13205 + }, + { + "epoch": 1.6799389390662767, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7732352018356323, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8658002614974976, + "num_tokens": 503724008.0, + "step": 13206 + }, + { + "epoch": 1.6800661493448672, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9187082052230835, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8715977668762207, + "num_tokens": 503759008.0, + "step": 13207 + }, + { + "epoch": 1.6801933596234577, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.7992184162139893, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8590551018714905, + "num_tokens": 503796599.0, + "step": 13208 + }, + { + "epoch": 1.680320569902048, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8473424911499023, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8601189255714417, + "num_tokens": 503835539.0, + "step": 13209 + }, + { + "epoch": 1.6804477801806386, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.95702064037323, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8589134216308594, + "num_tokens": 503875933.0, + "step": 13210 + }, + { + "epoch": 1.680574990459229, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.029244899749756, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8601281046867371, + "num_tokens": 503913740.0, + "step": 13211 + }, + { + "epoch": 1.6807022007378196, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7215267419815063, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8664335012435913, + "num_tokens": 503955105.0, + "step": 13212 + }, + { + "epoch": 1.6808294110164101, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.3042752742767334, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8638908267021179, + "num_tokens": 503987641.0, + "step": 13213 + }, + { + "epoch": 1.6809566212950007, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9615683555603027, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8542934060096741, + "num_tokens": 504028699.0, + "step": 13214 + }, + { + "epoch": 1.681083831573591, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.022892475128174, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8567279577255249, + "num_tokens": 504065597.0, + "step": 13215 + }, + { + "epoch": 1.6812110418521815, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8271617889404297, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8479720950126648, + "num_tokens": 504105968.0, + "step": 13216 + }, + { + "epoch": 1.681338252130772, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.89591383934021, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8668088316917419, + "num_tokens": 504143743.0, + "step": 13217 + }, + { + "epoch": 1.6814654624093626, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9170749187469482, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8660290837287903, + "num_tokens": 504177023.0, + "step": 13218 + }, + { + "epoch": 1.681592672687953, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.002346992492676, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8613563776016235, + "num_tokens": 504210492.0, + "step": 13219 + }, + { + "epoch": 1.6817198829665436, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8202283382415771, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8623392581939697, + "num_tokens": 504248322.0, + "step": 13220 + }, + { + "epoch": 1.6818470932451342, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0114967823028564, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8480923175811768, + "num_tokens": 504285915.0, + "step": 13221 + }, + { + "epoch": 1.6819743035237247, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8056424856185913, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8610414266586304, + "num_tokens": 504325578.0, + "step": 13222 + }, + { + "epoch": 1.6821015138023152, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8138794898986816, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8828372955322266, + "num_tokens": 504362876.0, + "step": 13223 + }, + { + "epoch": 1.6822287240809057, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9401944875717163, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.860680103302002, + "num_tokens": 504402303.0, + "step": 13224 + }, + { + "epoch": 1.6823559343594963, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9390650987625122, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8630305528640747, + "num_tokens": 504440039.0, + "step": 13225 + }, + { + "epoch": 1.6824831446380868, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1132619380950928, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8537439107894897, + "num_tokens": 504485102.0, + "step": 13226 + }, + { + "epoch": 1.6826103549166773, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7352365255355835, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8615934252738953, + "num_tokens": 504522762.0, + "step": 13227 + }, + { + "epoch": 1.6827375651952678, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7233260869979858, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8676608800888062, + "num_tokens": 504562818.0, + "step": 13228 + }, + { + "epoch": 1.6828647754738584, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.221665143966675, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8536796569824219, + "num_tokens": 504602914.0, + "step": 13229 + }, + { + "epoch": 1.682991985752449, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8403526544570923, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8685827255249023, + "num_tokens": 504641376.0, + "step": 13230 + }, + { + "epoch": 1.6831191960310394, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.830502986907959, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8692846894264221, + "num_tokens": 504675031.0, + "step": 13231 + }, + { + "epoch": 1.68324640630963, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9793503284454346, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8693205118179321, + "num_tokens": 504709624.0, + "step": 13232 + }, + { + "epoch": 1.6833736165882205, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8235799074172974, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8657586574554443, + "num_tokens": 504749310.0, + "step": 13233 + }, + { + "epoch": 1.6835008268668108, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7934341430664062, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8647178411483765, + "num_tokens": 504791700.0, + "step": 13234 + }, + { + "epoch": 1.6836280371454013, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8221938610076904, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8524242043495178, + "num_tokens": 504833246.0, + "step": 13235 + }, + { + "epoch": 1.6837552474239919, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8370625972747803, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8672122955322266, + "num_tokens": 504872901.0, + "step": 13236 + }, + { + "epoch": 1.6838824577025824, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9717282056808472, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8561195135116577, + "num_tokens": 504906138.0, + "step": 13237 + }, + { + "epoch": 1.684009667981173, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9617795944213867, + "learning_rate": 1e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8419045805931091, + "num_tokens": 504947147.0, + "step": 13238 + }, + { + "epoch": 1.6841368782597634, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.833680510520935, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8647477030754089, + "num_tokens": 504985332.0, + "step": 13239 + }, + { + "epoch": 1.6842640885383537, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8720320463180542, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8548730611801147, + "num_tokens": 505027995.0, + "step": 13240 + }, + { + "epoch": 1.6843912988169443, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.865748405456543, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8650607466697693, + "num_tokens": 505065311.0, + "step": 13241 + }, + { + "epoch": 1.6845185090955348, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.057042121887207, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8578156232833862, + "num_tokens": 505105810.0, + "step": 13242 + }, + { + "epoch": 1.6846457193741253, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.878852367401123, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8696337938308716, + "num_tokens": 505141003.0, + "step": 13243 + }, + { + "epoch": 1.6847729296527159, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8117358684539795, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8526246547698975, + "num_tokens": 505188025.0, + "step": 13244 + }, + { + "epoch": 1.6849001399313064, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6975998878479004, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8774685263633728, + "num_tokens": 505226101.0, + "step": 13245 + }, + { + "epoch": 1.685027350209897, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8520214557647705, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8536321520805359, + "num_tokens": 505264880.0, + "step": 13246 + }, + { + "epoch": 1.6851545604884874, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7860709428787231, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8554624915122986, + "num_tokens": 505306208.0, + "step": 13247 + }, + { + "epoch": 1.685281770767078, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6743661165237427, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8845968842506409, + "num_tokens": 505346024.0, + "step": 13248 + }, + { + "epoch": 1.6854089810456685, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.3697264194488525, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8853548169136047, + "num_tokens": 505383644.0, + "step": 13249 + }, + { + "epoch": 1.685536191324259, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8265366554260254, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8677563667297363, + "num_tokens": 505420365.0, + "step": 13250 + }, + { + "epoch": 1.6856634016028496, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9376660585403442, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.863582968711853, + "num_tokens": 505458211.0, + "step": 13251 + }, + { + "epoch": 1.68579061188144, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8633171319961548, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8579083681106567, + "num_tokens": 505501649.0, + "step": 13252 + }, + { + "epoch": 1.6859178221600306, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8930131196975708, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.865538477897644, + "num_tokens": 505538763.0, + "step": 13253 + }, + { + "epoch": 1.6860450324386211, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7715561389923096, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.885908842086792, + "num_tokens": 505574562.0, + "step": 13254 + }, + { + "epoch": 1.6861722427172117, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8334176540374756, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8785436153411865, + "num_tokens": 505609718.0, + "step": 13255 + }, + { + "epoch": 1.6862994529958022, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.016144275665283, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8754859566688538, + "num_tokens": 505644337.0, + "step": 13256 + }, + { + "epoch": 1.6864266632743927, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.5420031547546387, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.872305691242218, + "num_tokens": 505681929.0, + "step": 13257 + }, + { + "epoch": 1.686553873552983, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.068427801132202, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8414183855056763, + "num_tokens": 505719373.0, + "step": 13258 + }, + { + "epoch": 1.6866810838315736, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9938277006149292, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8475910425186157, + "num_tokens": 505757767.0, + "step": 13259 + }, + { + "epoch": 1.686808294110164, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0607106685638428, + "learning_rate": 1e-06, + "loss": 0.5678, + "mean_token_accuracy": 0.8321148157119751, + "num_tokens": 505794777.0, + "step": 13260 + }, + { + "epoch": 1.6869355043887546, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1596128940582275, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8697683215141296, + "num_tokens": 505832672.0, + "step": 13261 + }, + { + "epoch": 1.6870627146673451, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8568748235702515, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8514313697814941, + "num_tokens": 505873142.0, + "step": 13262 + }, + { + "epoch": 1.6871899249459357, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8212792873382568, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8670006990432739, + "num_tokens": 505911994.0, + "step": 13263 + }, + { + "epoch": 1.687317135224526, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8041263818740845, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8673778176307678, + "num_tokens": 505948874.0, + "step": 13264 + }, + { + "epoch": 1.6874443455031165, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8364510536193848, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8690873384475708, + "num_tokens": 505989679.0, + "step": 13265 + }, + { + "epoch": 1.687571555781707, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.797332525253296, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8709839582443237, + "num_tokens": 506033209.0, + "step": 13266 + }, + { + "epoch": 1.6876987660602976, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9449669122695923, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8725250959396362, + "num_tokens": 506065379.0, + "step": 13267 + }, + { + "epoch": 1.687825976338888, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9244269132614136, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8549387454986572, + "num_tokens": 506099484.0, + "step": 13268 + }, + { + "epoch": 1.6879531866174786, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.92106294631958, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8465163707733154, + "num_tokens": 506136559.0, + "step": 13269 + }, + { + "epoch": 1.6880803968960691, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0456650257110596, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8615611791610718, + "num_tokens": 506172997.0, + "step": 13270 + }, + { + "epoch": 1.6882076071746597, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7575159072875977, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.877684473991394, + "num_tokens": 506209858.0, + "step": 13271 + }, + { + "epoch": 1.6883348174532502, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9420254230499268, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8745197057723999, + "num_tokens": 506242260.0, + "step": 13272 + }, + { + "epoch": 1.6884620277318407, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0096418857574463, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8495442867279053, + "num_tokens": 506276856.0, + "step": 13273 + }, + { + "epoch": 1.6885892380104313, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7795867919921875, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.881460428237915, + "num_tokens": 506311469.0, + "step": 13274 + }, + { + "epoch": 1.6887164482890218, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8444794416427612, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8724857568740845, + "num_tokens": 506348240.0, + "step": 13275 + }, + { + "epoch": 1.6888436585676123, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9389773607254028, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8650714755058289, + "num_tokens": 506387203.0, + "step": 13276 + }, + { + "epoch": 1.6889708688462028, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0542004108428955, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8405885696411133, + "num_tokens": 506421869.0, + "step": 13277 + }, + { + "epoch": 1.6890980791247934, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0704267024993896, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8640497922897339, + "num_tokens": 506458873.0, + "step": 13278 + }, + { + "epoch": 1.689225289403384, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9092347621917725, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8517435789108276, + "num_tokens": 506495947.0, + "step": 13279 + }, + { + "epoch": 1.6893524996819744, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.901106595993042, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8462554216384888, + "num_tokens": 506533098.0, + "step": 13280 + }, + { + "epoch": 1.689479709960565, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6877870559692383, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8742188215255737, + "num_tokens": 506571346.0, + "step": 13281 + }, + { + "epoch": 1.6896069202391555, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.994478464126587, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8660356998443604, + "num_tokens": 506607912.0, + "step": 13282 + }, + { + "epoch": 1.6897341305177458, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8349188566207886, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8527172803878784, + "num_tokens": 506654258.0, + "step": 13283 + }, + { + "epoch": 1.6898613407963363, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9634006023406982, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.870123028755188, + "num_tokens": 506693436.0, + "step": 13284 + }, + { + "epoch": 1.6899885510749268, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7791651487350464, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.865876317024231, + "num_tokens": 506737383.0, + "step": 13285 + }, + { + "epoch": 1.6901157613535174, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.6428489685058594, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.873231053352356, + "num_tokens": 506772282.0, + "step": 13286 + }, + { + "epoch": 1.690242971632108, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9649139642715454, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8745977282524109, + "num_tokens": 506807208.0, + "step": 13287 + }, + { + "epoch": 1.6903701819106984, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8674482107162476, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8703757524490356, + "num_tokens": 506847952.0, + "step": 13288 + }, + { + "epoch": 1.6904973921892887, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.069525957107544, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8668798208236694, + "num_tokens": 506888265.0, + "step": 13289 + }, + { + "epoch": 1.6906246024678793, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.086792230606079, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8674476146697998, + "num_tokens": 506926096.0, + "step": 13290 + }, + { + "epoch": 1.6907518127464698, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.826356291770935, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8563007116317749, + "num_tokens": 506962252.0, + "step": 13291 + }, + { + "epoch": 1.6908790230250603, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8033168315887451, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8686286211013794, + "num_tokens": 507004709.0, + "step": 13292 + }, + { + "epoch": 1.6910062333036509, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8551294803619385, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8799095153808594, + "num_tokens": 507041041.0, + "step": 13293 + }, + { + "epoch": 1.6911334435822414, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8349095582962036, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.857821524143219, + "num_tokens": 507080251.0, + "step": 13294 + }, + { + "epoch": 1.691260653860832, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7613024711608887, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8638975620269775, + "num_tokens": 507120813.0, + "step": 13295 + }, + { + "epoch": 1.6913878641394224, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9655715227127075, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8515098094940186, + "num_tokens": 507160284.0, + "step": 13296 + }, + { + "epoch": 1.691515074418013, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8761042356491089, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8559293746948242, + "num_tokens": 507205069.0, + "step": 13297 + }, + { + "epoch": 1.6916422846966035, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8213040828704834, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8728146553039551, + "num_tokens": 507244689.0, + "step": 13298 + }, + { + "epoch": 1.691769494975194, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.894193410873413, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8636202216148376, + "num_tokens": 507283917.0, + "step": 13299 + }, + { + "epoch": 1.6918967052537845, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6493549346923828, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8661049604415894, + "num_tokens": 507327937.0, + "step": 13300 + }, + { + "epoch": 1.692023915532375, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8299058675765991, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8763799071311951, + "num_tokens": 507369521.0, + "step": 13301 + }, + { + "epoch": 1.6921511258109656, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7601631879806519, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8734354376792908, + "num_tokens": 507408157.0, + "step": 13302 + }, + { + "epoch": 1.6922783360895561, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.890634536743164, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8702181577682495, + "num_tokens": 507448003.0, + "step": 13303 + }, + { + "epoch": 1.6924055463681467, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.78190016746521, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8721569776535034, + "num_tokens": 507489970.0, + "step": 13304 + }, + { + "epoch": 1.6925327566467372, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.901701807975769, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8654296398162842, + "num_tokens": 507524536.0, + "step": 13305 + }, + { + "epoch": 1.6926599669253277, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1935765743255615, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8846617937088013, + "num_tokens": 507561575.0, + "step": 13306 + }, + { + "epoch": 1.692787177203918, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.136444091796875, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8652599453926086, + "num_tokens": 507597212.0, + "step": 13307 + }, + { + "epoch": 1.6929143874825086, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9725730419158936, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8675298094749451, + "num_tokens": 507634251.0, + "step": 13308 + }, + { + "epoch": 1.693041597761099, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9202923774719238, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8702007532119751, + "num_tokens": 507671345.0, + "step": 13309 + }, + { + "epoch": 1.6931688080396896, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1492538452148438, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8537193536758423, + "num_tokens": 507709797.0, + "step": 13310 + }, + { + "epoch": 1.6932960183182801, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9257389307022095, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8638911843299866, + "num_tokens": 507747047.0, + "step": 13311 + }, + { + "epoch": 1.6934232285968707, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8867597579956055, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8634786605834961, + "num_tokens": 507782187.0, + "step": 13312 + }, + { + "epoch": 1.693550438875461, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8198119401931763, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8509407043457031, + "num_tokens": 507826714.0, + "step": 13313 + }, + { + "epoch": 1.6936776491540515, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7568917274475098, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8555155396461487, + "num_tokens": 507869423.0, + "step": 13314 + }, + { + "epoch": 1.693804859432642, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8554002046585083, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8744891881942749, + "num_tokens": 507904711.0, + "step": 13315 + }, + { + "epoch": 1.6939320697112326, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8988840579986572, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8593380451202393, + "num_tokens": 507947760.0, + "step": 13316 + }, + { + "epoch": 1.694059279989823, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8571884632110596, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8608457446098328, + "num_tokens": 507984249.0, + "step": 13317 + }, + { + "epoch": 1.6941864902684136, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1195900440216064, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8609337210655212, + "num_tokens": 508017206.0, + "step": 13318 + }, + { + "epoch": 1.6943137005470041, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8899976015090942, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8580865263938904, + "num_tokens": 508056357.0, + "step": 13319 + }, + { + "epoch": 1.6944409108255947, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8621777296066284, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8558807373046875, + "num_tokens": 508094870.0, + "step": 13320 + }, + { + "epoch": 1.6945681211041852, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.11802077293396, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8635871410369873, + "num_tokens": 508120323.0, + "step": 13321 + }, + { + "epoch": 1.6946953313827757, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8524260520935059, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8556634783744812, + "num_tokens": 508160018.0, + "step": 13322 + }, + { + "epoch": 1.6948225416613663, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7684043645858765, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8824912309646606, + "num_tokens": 508200947.0, + "step": 13323 + }, + { + "epoch": 1.6949497519399568, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7177152633666992, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8503985404968262, + "num_tokens": 508246457.0, + "step": 13324 + }, + { + "epoch": 1.6950769622185473, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9424126148223877, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.858026921749115, + "num_tokens": 508281951.0, + "step": 13325 + }, + { + "epoch": 1.6952041724971378, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9188352823257446, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8448469638824463, + "num_tokens": 508322571.0, + "step": 13326 + }, + { + "epoch": 1.6953313827757284, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.813658595085144, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8701560497283936, + "num_tokens": 508357347.0, + "step": 13327 + }, + { + "epoch": 1.695458593054319, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.695206642150879, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8660063147544861, + "num_tokens": 508401813.0, + "step": 13328 + }, + { + "epoch": 1.6955858033329094, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.020576238632202, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8519258499145508, + "num_tokens": 508440457.0, + "step": 13329 + }, + { + "epoch": 1.6957130136115, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.079054832458496, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8549677133560181, + "num_tokens": 508472272.0, + "step": 13330 + }, + { + "epoch": 1.6958402238900905, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9922305345535278, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8753482103347778, + "num_tokens": 508509352.0, + "step": 13331 + }, + { + "epoch": 1.6959674341686808, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8877756595611572, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8726212382316589, + "num_tokens": 508552515.0, + "step": 13332 + }, + { + "epoch": 1.6960946444472713, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8256980180740356, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.875577449798584, + "num_tokens": 508591399.0, + "step": 13333 + }, + { + "epoch": 1.6962218547258618, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0227932929992676, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8686296939849854, + "num_tokens": 508630427.0, + "step": 13334 + }, + { + "epoch": 1.6963490650044524, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.990153431892395, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8724732995033264, + "num_tokens": 508667469.0, + "step": 13335 + }, + { + "epoch": 1.696476275283043, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.886869192123413, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8686094880104065, + "num_tokens": 508706308.0, + "step": 13336 + }, + { + "epoch": 1.6966034855616334, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1977086067199707, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8518228530883789, + "num_tokens": 508738968.0, + "step": 13337 + }, + { + "epoch": 1.6967306958402237, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0691425800323486, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8708459734916687, + "num_tokens": 508769879.0, + "step": 13338 + }, + { + "epoch": 1.6968579061188143, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9188172817230225, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8497001528739929, + "num_tokens": 508812277.0, + "step": 13339 + }, + { + "epoch": 1.6969851163974048, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8781882524490356, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8545392155647278, + "num_tokens": 508849293.0, + "step": 13340 + }, + { + "epoch": 1.6971123266759953, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.93802011013031, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8658002614974976, + "num_tokens": 508885798.0, + "step": 13341 + }, + { + "epoch": 1.6972395369545858, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0049915313720703, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8619191646575928, + "num_tokens": 508920491.0, + "step": 13342 + }, + { + "epoch": 1.6973667472331764, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1336522102355957, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8627018928527832, + "num_tokens": 508956161.0, + "step": 13343 + }, + { + "epoch": 1.697493957511767, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.908514380455017, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8605102896690369, + "num_tokens": 508995696.0, + "step": 13344 + }, + { + "epoch": 1.6976211677903574, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.898030161857605, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8678879737854004, + "num_tokens": 509029505.0, + "step": 13345 + }, + { + "epoch": 1.697748378068948, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9217166900634766, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.870710015296936, + "num_tokens": 509062815.0, + "step": 13346 + }, + { + "epoch": 1.6978755883475385, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.8133796453475952, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.870773196220398, + "num_tokens": 509103376.0, + "step": 13347 + }, + { + "epoch": 1.698002798626129, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9947750568389893, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8728210926055908, + "num_tokens": 509142797.0, + "step": 13348 + }, + { + "epoch": 1.6981300089047195, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8058955669403076, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8522379398345947, + "num_tokens": 509182724.0, + "step": 13349 + }, + { + "epoch": 1.69825721918331, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.054286003112793, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8724609613418579, + "num_tokens": 509217856.0, + "step": 13350 + }, + { + "epoch": 1.6983844294619006, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9249885082244873, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8647600412368774, + "num_tokens": 509251581.0, + "step": 13351 + }, + { + "epoch": 1.6985116397404911, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.839067816734314, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8804666996002197, + "num_tokens": 509286687.0, + "step": 13352 + }, + { + "epoch": 1.6986388500190817, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.906903862953186, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8583508729934692, + "num_tokens": 509323372.0, + "step": 13353 + }, + { + "epoch": 1.6987660602976722, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9461127519607544, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8564286231994629, + "num_tokens": 509362472.0, + "step": 13354 + }, + { + "epoch": 1.6988932705762627, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9573248624801636, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8653831481933594, + "num_tokens": 509397676.0, + "step": 13355 + }, + { + "epoch": 1.699020480854853, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.777899980545044, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8672246932983398, + "num_tokens": 509441100.0, + "step": 13356 + }, + { + "epoch": 1.6991476911334435, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8204933404922485, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8602165579795837, + "num_tokens": 509485430.0, + "step": 13357 + }, + { + "epoch": 1.699274901412034, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9529671669006348, + "learning_rate": 1e-06, + "loss": 0.537, + "mean_token_accuracy": 0.8349826335906982, + "num_tokens": 509520102.0, + "step": 13358 + }, + { + "epoch": 1.6994021116906246, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9359149932861328, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8649495840072632, + "num_tokens": 509556525.0, + "step": 13359 + }, + { + "epoch": 1.6995293219692151, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9922258853912354, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8564570546150208, + "num_tokens": 509596899.0, + "step": 13360 + }, + { + "epoch": 1.6996565322478057, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.036339044570923, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8655390739440918, + "num_tokens": 509632125.0, + "step": 13361 + }, + { + "epoch": 1.699783742526396, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9133920669555664, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8585610389709473, + "num_tokens": 509668131.0, + "step": 13362 + }, + { + "epoch": 1.6999109528049865, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7516452074050903, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8771207928657532, + "num_tokens": 509704078.0, + "step": 13363 + }, + { + "epoch": 1.700038163083577, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9356623888015747, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8765509128570557, + "num_tokens": 509739464.0, + "step": 13364 + }, + { + "epoch": 1.7001653733621676, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8837898969650269, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8637211322784424, + "num_tokens": 509777970.0, + "step": 13365 + }, + { + "epoch": 1.700292583640758, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.80272376537323, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8506719470024109, + "num_tokens": 509817010.0, + "step": 13366 + }, + { + "epoch": 1.7004197939193486, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.5964510440826416, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8691859245300293, + "num_tokens": 509854192.0, + "step": 13367 + }, + { + "epoch": 1.7005470041979391, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9333535432815552, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8776557445526123, + "num_tokens": 509888633.0, + "step": 13368 + }, + { + "epoch": 1.7006742144765297, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8611027002334595, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8662989139556885, + "num_tokens": 509928474.0, + "step": 13369 + }, + { + "epoch": 1.7008014247551202, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.068847179412842, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8563172817230225, + "num_tokens": 509968875.0, + "step": 13370 + }, + { + "epoch": 1.7009286350337107, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.147799491882324, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8643868565559387, + "num_tokens": 510006021.0, + "step": 13371 + }, + { + "epoch": 1.7010558453123013, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.856946349143982, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.868971586227417, + "num_tokens": 510042111.0, + "step": 13372 + }, + { + "epoch": 1.7011830555908918, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.108943223953247, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8704063892364502, + "num_tokens": 510074894.0, + "step": 13373 + }, + { + "epoch": 1.7013102658694823, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0597283840179443, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8560448884963989, + "num_tokens": 510111516.0, + "step": 13374 + }, + { + "epoch": 1.7014374761480728, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7784090042114258, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8677961826324463, + "num_tokens": 510151549.0, + "step": 13375 + }, + { + "epoch": 1.7015646864266634, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8484554290771484, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.86392742395401, + "num_tokens": 510189215.0, + "step": 13376 + }, + { + "epoch": 1.701691896705254, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6200335025787354, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8838626146316528, + "num_tokens": 510231189.0, + "step": 13377 + }, + { + "epoch": 1.7018191069838444, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6818126440048218, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.868660032749176, + "num_tokens": 510273956.0, + "step": 13378 + }, + { + "epoch": 1.701946317262435, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7474781274795532, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8799289464950562, + "num_tokens": 510315095.0, + "step": 13379 + }, + { + "epoch": 1.7020735275410255, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.927450180053711, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8423994779586792, + "num_tokens": 510350753.0, + "step": 13380 + }, + { + "epoch": 1.7022007378196158, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9080723524093628, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8703833818435669, + "num_tokens": 510389257.0, + "step": 13381 + }, + { + "epoch": 1.7023279480982063, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9635688066482544, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8658493757247925, + "num_tokens": 510428879.0, + "step": 13382 + }, + { + "epoch": 1.7024551583767968, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8739186525344849, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8666598796844482, + "num_tokens": 510468409.0, + "step": 13383 + }, + { + "epoch": 1.7025823686553874, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9004027843475342, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8799113035202026, + "num_tokens": 510503988.0, + "step": 13384 + }, + { + "epoch": 1.702709578933978, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8917309045791626, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8620271682739258, + "num_tokens": 510539009.0, + "step": 13385 + }, + { + "epoch": 1.7028367892125684, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8640005588531494, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8789425492286682, + "num_tokens": 510576177.0, + "step": 13386 + }, + { + "epoch": 1.7029639994911587, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.70551598072052, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8641444444656372, + "num_tokens": 510622194.0, + "step": 13387 + }, + { + "epoch": 1.7030912097697493, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.723884105682373, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8813589215278625, + "num_tokens": 510663564.0, + "step": 13388 + }, + { + "epoch": 1.7032184200483398, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9839398860931396, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8746405839920044, + "num_tokens": 510702153.0, + "step": 13389 + }, + { + "epoch": 1.7033456303269303, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7876256704330444, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8602246046066284, + "num_tokens": 510742812.0, + "step": 13390 + }, + { + "epoch": 1.7034728406055208, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.103942632675171, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.858960747718811, + "num_tokens": 510783363.0, + "step": 13391 + }, + { + "epoch": 1.7036000508841114, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9975521564483643, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8493866324424744, + "num_tokens": 510818362.0, + "step": 13392 + }, + { + "epoch": 1.703727261162702, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9502558708190918, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8752431273460388, + "num_tokens": 510854732.0, + "step": 13393 + }, + { + "epoch": 1.7038544714412924, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8790020942687988, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8581748008728027, + "num_tokens": 510895107.0, + "step": 13394 + }, + { + "epoch": 1.703981681719883, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.048793315887451, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8602443933486938, + "num_tokens": 510930737.0, + "step": 13395 + }, + { + "epoch": 1.7041088919984735, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9019464254379272, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.872429370880127, + "num_tokens": 510964646.0, + "step": 13396 + }, + { + "epoch": 1.704236102277064, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9105600118637085, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8677672147750854, + "num_tokens": 511000583.0, + "step": 13397 + }, + { + "epoch": 1.7043633125556545, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.998143196105957, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8668022751808167, + "num_tokens": 511033729.0, + "step": 13398 + }, + { + "epoch": 1.704490522834245, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7947787046432495, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.869850754737854, + "num_tokens": 511074099.0, + "step": 13399 + }, + { + "epoch": 1.7046177331128356, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0594472885131836, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8534078598022461, + "num_tokens": 511112473.0, + "step": 13400 + }, + { + "epoch": 1.7047449433914261, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.938310980796814, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8641920685768127, + "num_tokens": 511149839.0, + "step": 13401 + }, + { + "epoch": 1.7048721536700167, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.047672986984253, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8546626567840576, + "num_tokens": 511183054.0, + "step": 13402 + }, + { + "epoch": 1.7049993639486072, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9782108068466187, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8685392737388611, + "num_tokens": 511218498.0, + "step": 13403 + }, + { + "epoch": 1.7051265742271977, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8256608247756958, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8608344197273254, + "num_tokens": 511259776.0, + "step": 13404 + }, + { + "epoch": 1.705253784505788, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9799972772598267, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8712831735610962, + "num_tokens": 511297331.0, + "step": 13405 + }, + { + "epoch": 1.7053809947843785, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9518828392028809, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8764305114746094, + "num_tokens": 511331055.0, + "step": 13406 + }, + { + "epoch": 1.705508205062969, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9928275346755981, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8815450668334961, + "num_tokens": 511368651.0, + "step": 13407 + }, + { + "epoch": 1.7056354153415596, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.060246706008911, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8692119121551514, + "num_tokens": 511402163.0, + "step": 13408 + }, + { + "epoch": 1.7057626256201501, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7476415634155273, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8574105501174927, + "num_tokens": 511443769.0, + "step": 13409 + }, + { + "epoch": 1.7058898358987407, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8528062105178833, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8622455596923828, + "num_tokens": 511481038.0, + "step": 13410 + }, + { + "epoch": 1.706017046177331, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7952605485916138, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8680052757263184, + "num_tokens": 511522940.0, + "step": 13411 + }, + { + "epoch": 1.7061442564559215, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8158611059188843, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8622941970825195, + "num_tokens": 511562630.0, + "step": 13412 + }, + { + "epoch": 1.706271466734512, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.91281259059906, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8494110107421875, + "num_tokens": 511598986.0, + "step": 13413 + }, + { + "epoch": 1.7063986770131025, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2278189659118652, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8661893606185913, + "num_tokens": 511633582.0, + "step": 13414 + }, + { + "epoch": 1.706525887291693, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.751145362854004, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8645824193954468, + "num_tokens": 511673296.0, + "step": 13415 + }, + { + "epoch": 1.7066530975702836, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.02705454826355, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8567851185798645, + "num_tokens": 511707890.0, + "step": 13416 + }, + { + "epoch": 1.7067803078488741, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9292798042297363, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8713939189910889, + "num_tokens": 511741597.0, + "step": 13417 + }, + { + "epoch": 1.7069075181274647, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9235022068023682, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8675938844680786, + "num_tokens": 511776816.0, + "step": 13418 + }, + { + "epoch": 1.7070347284060552, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1210405826568604, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8584989309310913, + "num_tokens": 511814866.0, + "step": 13419 + }, + { + "epoch": 1.7071619386846457, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8203412294387817, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8650407791137695, + "num_tokens": 511852458.0, + "step": 13420 + }, + { + "epoch": 1.7072891489632362, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8289228677749634, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8589690923690796, + "num_tokens": 511891201.0, + "step": 13421 + }, + { + "epoch": 1.7074163592418268, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7940598726272583, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8643752336502075, + "num_tokens": 511930862.0, + "step": 13422 + }, + { + "epoch": 1.7075435695204173, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.757147192955017, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8720401525497437, + "num_tokens": 511967695.0, + "step": 13423 + }, + { + "epoch": 1.7076707797990078, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7388801574707031, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8791848421096802, + "num_tokens": 512005976.0, + "step": 13424 + }, + { + "epoch": 1.7077979900775984, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.961098551750183, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8654127717018127, + "num_tokens": 512041575.0, + "step": 13425 + }, + { + "epoch": 1.7079252003561889, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9551172256469727, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8597097396850586, + "num_tokens": 512076171.0, + "step": 13426 + }, + { + "epoch": 1.7080524106347794, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.820455551147461, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8617881536483765, + "num_tokens": 512112082.0, + "step": 13427 + }, + { + "epoch": 1.70817962091337, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0222692489624023, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8510386943817139, + "num_tokens": 512149126.0, + "step": 13428 + }, + { + "epoch": 1.7083068311919605, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2073943614959717, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8654136657714844, + "num_tokens": 512182368.0, + "step": 13429 + }, + { + "epoch": 1.7084340414705508, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1311123371124268, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8446168899536133, + "num_tokens": 512218224.0, + "step": 13430 + }, + { + "epoch": 1.7085612517491413, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8792120218276978, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8637530207633972, + "num_tokens": 512255523.0, + "step": 13431 + }, + { + "epoch": 1.7086884620277318, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8665897846221924, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8745722770690918, + "num_tokens": 512290514.0, + "step": 13432 + }, + { + "epoch": 1.7088156723063224, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7911118268966675, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8731129765510559, + "num_tokens": 512328468.0, + "step": 13433 + }, + { + "epoch": 1.708942882584913, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.974453330039978, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8620728254318237, + "num_tokens": 512359786.0, + "step": 13434 + }, + { + "epoch": 1.7090700928635034, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.955859899520874, + "learning_rate": 1e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8427488207817078, + "num_tokens": 512395679.0, + "step": 13435 + }, + { + "epoch": 1.7091973031420937, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7983286380767822, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8622440695762634, + "num_tokens": 512433491.0, + "step": 13436 + }, + { + "epoch": 1.7093245134206843, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9494694471359253, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8582781553268433, + "num_tokens": 512470215.0, + "step": 13437 + }, + { + "epoch": 1.7094517236992748, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8812569379806519, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8608853220939636, + "num_tokens": 512507055.0, + "step": 13438 + }, + { + "epoch": 1.7095789339778653, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.11742901802063, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8471349477767944, + "num_tokens": 512547877.0, + "step": 13439 + }, + { + "epoch": 1.7097061442564558, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1383848190307617, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8504996299743652, + "num_tokens": 512586558.0, + "step": 13440 + }, + { + "epoch": 1.7098333545350464, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7242332696914673, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8606854677200317, + "num_tokens": 512631181.0, + "step": 13441 + }, + { + "epoch": 1.709960564813637, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.800586462020874, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8744773864746094, + "num_tokens": 512667925.0, + "step": 13442 + }, + { + "epoch": 1.7100877750922274, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.931255578994751, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8475391268730164, + "num_tokens": 512703136.0, + "step": 13443 + }, + { + "epoch": 1.710214985370818, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9275449514389038, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8656723499298096, + "num_tokens": 512739001.0, + "step": 13444 + }, + { + "epoch": 1.7103421956494085, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6941238641738892, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8681737184524536, + "num_tokens": 512780517.0, + "step": 13445 + }, + { + "epoch": 1.710469405927999, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.81589937210083, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8720561861991882, + "num_tokens": 512815964.0, + "step": 13446 + }, + { + "epoch": 1.7105966162065895, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8684227466583252, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8667929172515869, + "num_tokens": 512855451.0, + "step": 13447 + }, + { + "epoch": 1.71072382648518, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8653019666671753, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8680872917175293, + "num_tokens": 512895809.0, + "step": 13448 + }, + { + "epoch": 1.7108510367637706, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.6743861436843872, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8693623542785645, + "num_tokens": 512936424.0, + "step": 13449 + }, + { + "epoch": 1.7109782470423611, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7804760932922363, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8791191577911377, + "num_tokens": 512976486.0, + "step": 13450 + }, + { + "epoch": 1.7111054573209517, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9928603172302246, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8542349338531494, + "num_tokens": 513014131.0, + "step": 13451 + }, + { + "epoch": 1.7112326675995422, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.852317452430725, + "learning_rate": 1e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8364343643188477, + "num_tokens": 513054706.0, + "step": 13452 + }, + { + "epoch": 1.7113598778781327, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.544158697128296, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8699691295623779, + "num_tokens": 513094089.0, + "step": 13453 + }, + { + "epoch": 1.711487088156723, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.243722677230835, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8616363406181335, + "num_tokens": 513136306.0, + "step": 13454 + }, + { + "epoch": 1.7116142984353135, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9452447891235352, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8594274520874023, + "num_tokens": 513174376.0, + "step": 13455 + }, + { + "epoch": 1.711741508713904, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8971984386444092, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8671006560325623, + "num_tokens": 513211834.0, + "step": 13456 + }, + { + "epoch": 1.7118687189924946, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0427122116088867, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8553476333618164, + "num_tokens": 513241765.0, + "step": 13457 + }, + { + "epoch": 1.7119959292710851, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.934815764427185, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8512371778488159, + "num_tokens": 513283510.0, + "step": 13458 + }, + { + "epoch": 1.7121231395496757, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9216293096542358, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.862316370010376, + "num_tokens": 513320746.0, + "step": 13459 + }, + { + "epoch": 1.712250349828266, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.834977149963379, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8591576814651489, + "num_tokens": 513358036.0, + "step": 13460 + }, + { + "epoch": 1.7123775601068565, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8458242416381836, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8692271709442139, + "num_tokens": 513395156.0, + "step": 13461 + }, + { + "epoch": 1.712504770385447, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8278520107269287, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8593963384628296, + "num_tokens": 513438631.0, + "step": 13462 + }, + { + "epoch": 1.7126319806640375, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.285397529602051, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8534132242202759, + "num_tokens": 513471225.0, + "step": 13463 + }, + { + "epoch": 1.712759190942628, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.886902093887329, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8553324937820435, + "num_tokens": 513513824.0, + "step": 13464 + }, + { + "epoch": 1.7128864012212186, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8878023624420166, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8564549088478088, + "num_tokens": 513556263.0, + "step": 13465 + }, + { + "epoch": 1.7130136114998091, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.853783130645752, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8673815727233887, + "num_tokens": 513595093.0, + "step": 13466 + }, + { + "epoch": 1.7131408217783997, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8967798948287964, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8560285568237305, + "num_tokens": 513633761.0, + "step": 13467 + }, + { + "epoch": 1.7132680320569902, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9343489408493042, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8601387143135071, + "num_tokens": 513671404.0, + "step": 13468 + }, + { + "epoch": 1.7133952423355807, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7796058654785156, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8623345494270325, + "num_tokens": 513713752.0, + "step": 13469 + }, + { + "epoch": 1.7135224526141712, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2672858238220215, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8592268228530884, + "num_tokens": 513755261.0, + "step": 13470 + }, + { + "epoch": 1.7136496628927618, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7117398977279663, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8678796291351318, + "num_tokens": 513799042.0, + "step": 13471 + }, + { + "epoch": 1.7137768731713523, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9092355966567993, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8531183004379272, + "num_tokens": 513838148.0, + "step": 13472 + }, + { + "epoch": 1.7139040834499428, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.746461033821106, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8708289861679077, + "num_tokens": 513878960.0, + "step": 13473 + }, + { + "epoch": 1.7140312937285334, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.202319860458374, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8657095432281494, + "num_tokens": 513914083.0, + "step": 13474 + }, + { + "epoch": 1.7141585040071239, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0297250747680664, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8523117303848267, + "num_tokens": 513951591.0, + "step": 13475 + }, + { + "epoch": 1.7142857142857144, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9449288845062256, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8495994210243225, + "num_tokens": 513986127.0, + "step": 13476 + }, + { + "epoch": 1.714412924564305, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.803958535194397, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8583502769470215, + "num_tokens": 514028248.0, + "step": 13477 + }, + { + "epoch": 1.7145401348428955, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0248968601226807, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8657916188240051, + "num_tokens": 514062757.0, + "step": 13478 + }, + { + "epoch": 1.7146673451214858, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8931496143341064, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8503898978233337, + "num_tokens": 514102635.0, + "step": 13479 + }, + { + "epoch": 1.7147945554000763, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.961745023727417, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.862089991569519, + "num_tokens": 514140883.0, + "step": 13480 + }, + { + "epoch": 1.7149217656786668, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.868342399597168, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8731414675712585, + "num_tokens": 514176536.0, + "step": 13481 + }, + { + "epoch": 1.7150489759572574, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9344885349273682, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8588738441467285, + "num_tokens": 514209969.0, + "step": 13482 + }, + { + "epoch": 1.7151761862358479, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8428847789764404, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8625985980033875, + "num_tokens": 514249679.0, + "step": 13483 + }, + { + "epoch": 1.7153033965144384, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.9365997314453125, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8566242456436157, + "num_tokens": 514292719.0, + "step": 13484 + }, + { + "epoch": 1.7154306067930287, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.155498504638672, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8572021126747131, + "num_tokens": 514323396.0, + "step": 13485 + }, + { + "epoch": 1.7155578170716193, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.8301199674606323, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8691991567611694, + "num_tokens": 514360951.0, + "step": 13486 + }, + { + "epoch": 1.7156850273502098, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.9830480813980103, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8482519388198853, + "num_tokens": 514403879.0, + "step": 13487 + }, + { + "epoch": 1.7158122376288003, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8178491592407227, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8705160617828369, + "num_tokens": 514446628.0, + "step": 13488 + }, + { + "epoch": 1.7159394479073908, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8710769414901733, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8630702495574951, + "num_tokens": 514486132.0, + "step": 13489 + }, + { + "epoch": 1.7160666581859814, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9325357675552368, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8516491651535034, + "num_tokens": 514526384.0, + "step": 13490 + }, + { + "epoch": 1.716193868464572, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.007955312728882, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.860887348651886, + "num_tokens": 514565828.0, + "step": 13491 + }, + { + "epoch": 1.7163210787431624, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.150599241256714, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8590385913848877, + "num_tokens": 514605260.0, + "step": 13492 + }, + { + "epoch": 1.716448289021753, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9058990478515625, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8639168739318848, + "num_tokens": 514643433.0, + "step": 13493 + }, + { + "epoch": 1.7165754993003435, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.16864275932312, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8656922578811646, + "num_tokens": 514683264.0, + "step": 13494 + }, + { + "epoch": 1.716702709578934, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.8832868337631226, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8709055185317993, + "num_tokens": 514719775.0, + "step": 13495 + }, + { + "epoch": 1.7168299198575245, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7968668937683105, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8561761379241943, + "num_tokens": 514766287.0, + "step": 13496 + }, + { + "epoch": 1.716957130136115, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8096288442611694, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8730512857437134, + "num_tokens": 514806725.0, + "step": 13497 + }, + { + "epoch": 1.7170843404147056, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7948851585388184, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8665345311164856, + "num_tokens": 514844008.0, + "step": 13498 + }, + { + "epoch": 1.7172115506932961, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9480786323547363, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8507920503616333, + "num_tokens": 514880356.0, + "step": 13499 + }, + { + "epoch": 1.7173387609718866, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.806574821472168, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.875105619430542, + "num_tokens": 514915309.0, + "step": 13500 + }, + { + "epoch": 1.7174659712504772, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8713181018829346, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8563268184661865, + "num_tokens": 514956496.0, + "step": 13501 + }, + { + "epoch": 1.7175931815290677, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8408448696136475, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8639825582504272, + "num_tokens": 514992247.0, + "step": 13502 + }, + { + "epoch": 1.717720391807658, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7003777027130127, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8557435870170593, + "num_tokens": 515035443.0, + "step": 13503 + }, + { + "epoch": 1.7178476020862485, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9318506717681885, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8681900501251221, + "num_tokens": 515072279.0, + "step": 13504 + }, + { + "epoch": 1.717974812364839, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.015002965927124, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8611288070678711, + "num_tokens": 515112547.0, + "step": 13505 + }, + { + "epoch": 1.7181020226434296, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.918992280960083, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8504051566123962, + "num_tokens": 515146814.0, + "step": 13506 + }, + { + "epoch": 1.7182292329220201, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8550058603286743, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8654568195343018, + "num_tokens": 515184393.0, + "step": 13507 + }, + { + "epoch": 1.7183564432006107, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1607773303985596, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8566598892211914, + "num_tokens": 515225312.0, + "step": 13508 + }, + { + "epoch": 1.718483653479201, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1033127307891846, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8410782217979431, + "num_tokens": 515261330.0, + "step": 13509 + }, + { + "epoch": 1.7186108637577915, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 3.0081944465637207, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8688493967056274, + "num_tokens": 515299325.0, + "step": 13510 + }, + { + "epoch": 1.718738074036382, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.0755372047424316, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8640246391296387, + "num_tokens": 515335213.0, + "step": 13511 + }, + { + "epoch": 1.7188652843149725, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.132302761077881, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8743822574615479, + "num_tokens": 515365699.0, + "step": 13512 + }, + { + "epoch": 1.718992494593563, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1246352195739746, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8713449239730835, + "num_tokens": 515396025.0, + "step": 13513 + }, + { + "epoch": 1.7191197048721536, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0254805088043213, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8644775152206421, + "num_tokens": 515429724.0, + "step": 13514 + }, + { + "epoch": 1.7192469151507441, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0338833332061768, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.859470009803772, + "num_tokens": 515465894.0, + "step": 13515 + }, + { + "epoch": 1.7193741254293347, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8952971696853638, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8830150961875916, + "num_tokens": 515498894.0, + "step": 13516 + }, + { + "epoch": 1.7195013357079252, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.813414454460144, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8624967336654663, + "num_tokens": 515538783.0, + "step": 13517 + }, + { + "epoch": 1.7196285459865157, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9571542739868164, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.862311840057373, + "num_tokens": 515573107.0, + "step": 13518 + }, + { + "epoch": 1.7197557562651062, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8175115585327148, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8657269477844238, + "num_tokens": 515614901.0, + "step": 13519 + }, + { + "epoch": 1.7198829665436968, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 3.022843360900879, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8654464483261108, + "num_tokens": 515654197.0, + "step": 13520 + }, + { + "epoch": 1.7200101768222873, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.838665246963501, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8818998336791992, + "num_tokens": 515697003.0, + "step": 13521 + }, + { + "epoch": 1.7201373871008778, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9702388048171997, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8591117858886719, + "num_tokens": 515737801.0, + "step": 13522 + }, + { + "epoch": 1.7202645973794684, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9599254131317139, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.854717493057251, + "num_tokens": 515774711.0, + "step": 13523 + }, + { + "epoch": 1.7203918076580589, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9499714374542236, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.855511486530304, + "num_tokens": 515811728.0, + "step": 13524 + }, + { + "epoch": 1.7205190179366494, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.899999737739563, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8695659637451172, + "num_tokens": 515844425.0, + "step": 13525 + }, + { + "epoch": 1.72064622821524, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8211653232574463, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8646585941314697, + "num_tokens": 515882439.0, + "step": 13526 + }, + { + "epoch": 1.7207734384938305, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0264222621917725, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8413244485855103, + "num_tokens": 515922168.0, + "step": 13527 + }, + { + "epoch": 1.7209006487724208, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8483150005340576, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8707566857337952, + "num_tokens": 515961840.0, + "step": 13528 + }, + { + "epoch": 1.7210278590510113, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7686269283294678, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8633303642272949, + "num_tokens": 515997586.0, + "step": 13529 + }, + { + "epoch": 1.7211550693296018, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9826135635375977, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8705208897590637, + "num_tokens": 516036660.0, + "step": 13530 + }, + { + "epoch": 1.7212822796081924, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0996084213256836, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8560842275619507, + "num_tokens": 516070846.0, + "step": 13531 + }, + { + "epoch": 1.7214094898867829, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.874265432357788, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8858421444892883, + "num_tokens": 516107027.0, + "step": 13532 + }, + { + "epoch": 1.7215367001653734, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9166330099105835, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8643004894256592, + "num_tokens": 516143971.0, + "step": 13533 + }, + { + "epoch": 1.7216639104439637, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.879286289215088, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8649328947067261, + "num_tokens": 516180617.0, + "step": 13534 + }, + { + "epoch": 1.7217911207225542, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8607213497161865, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8639724254608154, + "num_tokens": 516220371.0, + "step": 13535 + }, + { + "epoch": 1.7219183310011448, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7994393110275269, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.865944504737854, + "num_tokens": 516258642.0, + "step": 13536 + }, + { + "epoch": 1.7220455412797353, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.922386884689331, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8572994470596313, + "num_tokens": 516298971.0, + "step": 13537 + }, + { + "epoch": 1.7221727515583258, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.950954556465149, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8755006194114685, + "num_tokens": 516334457.0, + "step": 13538 + }, + { + "epoch": 1.7222999618369164, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.747787356376648, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8888092637062073, + "num_tokens": 516374585.0, + "step": 13539 + }, + { + "epoch": 1.7224271721155069, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.023374080657959, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.850009560585022, + "num_tokens": 516414180.0, + "step": 13540 + }, + { + "epoch": 1.7225543823940974, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.825941801071167, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8715437650680542, + "num_tokens": 516450189.0, + "step": 13541 + }, + { + "epoch": 1.722681592672688, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8884062767028809, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8620634078979492, + "num_tokens": 516487549.0, + "step": 13542 + }, + { + "epoch": 1.7228088029512785, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.946856141090393, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8500098586082458, + "num_tokens": 516526649.0, + "step": 13543 + }, + { + "epoch": 1.722936013229869, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9957315921783447, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8684054613113403, + "num_tokens": 516563829.0, + "step": 13544 + }, + { + "epoch": 1.7230632235084595, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8275808095932007, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.84589022397995, + "num_tokens": 516605179.0, + "step": 13545 + }, + { + "epoch": 1.72319043378705, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7879198789596558, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8618043661117554, + "num_tokens": 516641576.0, + "step": 13546 + }, + { + "epoch": 1.7233176440656406, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8552290201187134, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8604899644851685, + "num_tokens": 516678687.0, + "step": 13547 + }, + { + "epoch": 1.7234448543442311, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0031490325927734, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8587532043457031, + "num_tokens": 516711484.0, + "step": 13548 + }, + { + "epoch": 1.7235720646228216, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8145023584365845, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8590953946113586, + "num_tokens": 516750598.0, + "step": 13549 + }, + { + "epoch": 1.7236992749014122, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.80105459690094, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8708122968673706, + "num_tokens": 516793438.0, + "step": 13550 + }, + { + "epoch": 1.7238264851800027, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 20.457752227783203, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8783715963363647, + "num_tokens": 516828692.0, + "step": 13551 + }, + { + "epoch": 1.723953695458593, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.056377649307251, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.864979088306427, + "num_tokens": 516867592.0, + "step": 13552 + }, + { + "epoch": 1.7240809057371835, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.895948052406311, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8508115410804749, + "num_tokens": 516907668.0, + "step": 13553 + }, + { + "epoch": 1.724208116015774, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8734962940216064, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8606972694396973, + "num_tokens": 516944628.0, + "step": 13554 + }, + { + "epoch": 1.7243353262943646, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.715441346168518, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8579003810882568, + "num_tokens": 516989027.0, + "step": 13555 + }, + { + "epoch": 1.7244625365729551, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0273995399475098, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8590657711029053, + "num_tokens": 517024464.0, + "step": 13556 + }, + { + "epoch": 1.7245897468515456, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.775365948677063, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8677085638046265, + "num_tokens": 517067714.0, + "step": 13557 + }, + { + "epoch": 1.724716957130136, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2898616790771484, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8661602139472961, + "num_tokens": 517107564.0, + "step": 13558 + }, + { + "epoch": 1.7248441674087265, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.808224081993103, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8680213689804077, + "num_tokens": 517148240.0, + "step": 13559 + }, + { + "epoch": 1.724971377687317, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7893798351287842, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8575112819671631, + "num_tokens": 517187841.0, + "step": 13560 + }, + { + "epoch": 1.7250985879659075, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.855481505393982, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.848200261592865, + "num_tokens": 517225732.0, + "step": 13561 + }, + { + "epoch": 1.725225798244498, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0722763538360596, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8472391366958618, + "num_tokens": 517260423.0, + "step": 13562 + }, + { + "epoch": 1.7253530085230886, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7803393602371216, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8513150215148926, + "num_tokens": 517300862.0, + "step": 13563 + }, + { + "epoch": 1.7254802188016791, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8337324857711792, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8682427406311035, + "num_tokens": 517341010.0, + "step": 13564 + }, + { + "epoch": 1.7256074290802697, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8896785974502563, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8463872671127319, + "num_tokens": 517376853.0, + "step": 13565 + }, + { + "epoch": 1.7257346393588602, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6923457384109497, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8570598363876343, + "num_tokens": 517421854.0, + "step": 13566 + }, + { + "epoch": 1.7258618496374507, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8049299716949463, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8570283651351929, + "num_tokens": 517461711.0, + "step": 13567 + }, + { + "epoch": 1.7259890599160412, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9066663980484009, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8698459267616272, + "num_tokens": 517494755.0, + "step": 13568 + }, + { + "epoch": 1.7261162701946318, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9141483306884766, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8550757765769958, + "num_tokens": 517539061.0, + "step": 13569 + }, + { + "epoch": 1.7262434804732223, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8311309814453125, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8590689897537231, + "num_tokens": 517579768.0, + "step": 13570 + }, + { + "epoch": 1.7263706907518128, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9285417795181274, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8748833537101746, + "num_tokens": 517612006.0, + "step": 13571 + }, + { + "epoch": 1.7264979010304033, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.951277256011963, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8626578450202942, + "num_tokens": 517647547.0, + "step": 13572 + }, + { + "epoch": 1.7266251113089939, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.005157232284546, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8534587621688843, + "num_tokens": 517689067.0, + "step": 13573 + }, + { + "epoch": 1.7267523215875844, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.05026912689209, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8670991063117981, + "num_tokens": 517729967.0, + "step": 13574 + }, + { + "epoch": 1.726879531866175, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8784579038619995, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8635149002075195, + "num_tokens": 517770427.0, + "step": 13575 + }, + { + "epoch": 1.7270067421447655, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.059856653213501, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8603039979934692, + "num_tokens": 517804202.0, + "step": 13576 + }, + { + "epoch": 1.7271339524233558, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9846878051757812, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8557708263397217, + "num_tokens": 517834825.0, + "step": 13577 + }, + { + "epoch": 1.7272611627019463, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8779412508010864, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8552619814872742, + "num_tokens": 517874600.0, + "step": 13578 + }, + { + "epoch": 1.7273883729805368, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.948538064956665, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8538181781768799, + "num_tokens": 517913488.0, + "step": 13579 + }, + { + "epoch": 1.7275155832591274, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7911784648895264, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8760467171669006, + "num_tokens": 517951094.0, + "step": 13580 + }, + { + "epoch": 1.7276427935377179, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7427674531936646, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8523401021957397, + "num_tokens": 517993811.0, + "step": 13581 + }, + { + "epoch": 1.7277700038163084, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7949705123901367, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.865253210067749, + "num_tokens": 518032860.0, + "step": 13582 + }, + { + "epoch": 1.7278972140948987, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0531275272369385, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.85733562707901, + "num_tokens": 518070256.0, + "step": 13583 + }, + { + "epoch": 1.7280244243734892, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.4942398071289062, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8692277669906616, + "num_tokens": 518102828.0, + "step": 13584 + }, + { + "epoch": 1.7281516346520798, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7248090505599976, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8747848272323608, + "num_tokens": 518146859.0, + "step": 13585 + }, + { + "epoch": 1.7282788449306703, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8104580640792847, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8691889047622681, + "num_tokens": 518186173.0, + "step": 13586 + }, + { + "epoch": 1.7284060552092608, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9784860610961914, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8470693230628967, + "num_tokens": 518221903.0, + "step": 13587 + }, + { + "epoch": 1.7285332654878514, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.024679660797119, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8524762392044067, + "num_tokens": 518258048.0, + "step": 13588 + }, + { + "epoch": 1.7286604757664419, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.062352418899536, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8548314571380615, + "num_tokens": 518299127.0, + "step": 13589 + }, + { + "epoch": 1.7287876860450324, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8981890678405762, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8736565113067627, + "num_tokens": 518336520.0, + "step": 13590 + }, + { + "epoch": 1.728914896323623, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9817674160003662, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.859143853187561, + "num_tokens": 518368484.0, + "step": 13591 + }, + { + "epoch": 1.7290421066022135, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9054142236709595, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8595184087753296, + "num_tokens": 518408439.0, + "step": 13592 + }, + { + "epoch": 1.729169316880804, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.146578788757324, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8417230844497681, + "num_tokens": 518447414.0, + "step": 13593 + }, + { + "epoch": 1.7292965271593945, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0185301303863525, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8581993579864502, + "num_tokens": 518486455.0, + "step": 13594 + }, + { + "epoch": 1.729423737437985, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8376964330673218, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8798613548278809, + "num_tokens": 518527023.0, + "step": 13595 + }, + { + "epoch": 1.7295509477165756, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9953476190567017, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.855757474899292, + "num_tokens": 518563285.0, + "step": 13596 + }, + { + "epoch": 1.729678157995166, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8749076128005981, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8628534078598022, + "num_tokens": 518607610.0, + "step": 13597 + }, + { + "epoch": 1.7298053682737566, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9850605726242065, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8719155788421631, + "num_tokens": 518649434.0, + "step": 13598 + }, + { + "epoch": 1.7299325785523472, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9993808269500732, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8692026138305664, + "num_tokens": 518684508.0, + "step": 13599 + }, + { + "epoch": 1.7300597888309377, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8602206707000732, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.868909478187561, + "num_tokens": 518717883.0, + "step": 13600 + }, + { + "epoch": 1.730186999109528, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8461683988571167, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.868754506111145, + "num_tokens": 518756158.0, + "step": 13601 + }, + { + "epoch": 1.7303142093881185, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.277773380279541, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8627275228500366, + "num_tokens": 518793003.0, + "step": 13602 + }, + { + "epoch": 1.730441419666709, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8502039909362793, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.856378436088562, + "num_tokens": 518838485.0, + "step": 13603 + }, + { + "epoch": 1.7305686299452996, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7425241470336914, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8559900522232056, + "num_tokens": 518885936.0, + "step": 13604 + }, + { + "epoch": 1.7306958402238901, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9105570316314697, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8498520255088806, + "num_tokens": 518925684.0, + "step": 13605 + }, + { + "epoch": 1.7308230505024806, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.005603551864624, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.864284336566925, + "num_tokens": 518966438.0, + "step": 13606 + }, + { + "epoch": 1.730950260781071, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7080988883972168, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8647654056549072, + "num_tokens": 519013576.0, + "step": 13607 + }, + { + "epoch": 1.7310774710596615, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.048515796661377, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8602108955383301, + "num_tokens": 519045342.0, + "step": 13608 + }, + { + "epoch": 1.731204681338252, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7829586267471313, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8486946821212769, + "num_tokens": 519088566.0, + "step": 13609 + }, + { + "epoch": 1.7313318916168425, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8171149492263794, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8658296465873718, + "num_tokens": 519126516.0, + "step": 13610 + }, + { + "epoch": 1.731459101895433, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8151806592941284, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.87132728099823, + "num_tokens": 519165431.0, + "step": 13611 + }, + { + "epoch": 1.7315863121740236, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8265211582183838, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8780621290206909, + "num_tokens": 519200991.0, + "step": 13612 + }, + { + "epoch": 1.7317135224526141, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.689124345779419, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.869125485420227, + "num_tokens": 519245988.0, + "step": 13613 + }, + { + "epoch": 1.7318407327312046, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9532232284545898, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.867382287979126, + "num_tokens": 519282595.0, + "step": 13614 + }, + { + "epoch": 1.7319679430097952, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.8525846004486084, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8562215566635132, + "num_tokens": 519319061.0, + "step": 13615 + }, + { + "epoch": 1.7320951532883857, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9411319494247437, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8600325584411621, + "num_tokens": 519358617.0, + "step": 13616 + }, + { + "epoch": 1.7322223635669762, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9527848958969116, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.861568033695221, + "num_tokens": 519392461.0, + "step": 13617 + }, + { + "epoch": 1.7323495738455668, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7765378952026367, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8705921173095703, + "num_tokens": 519432893.0, + "step": 13618 + }, + { + "epoch": 1.7324767841241573, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.845094919204712, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8634622097015381, + "num_tokens": 519473445.0, + "step": 13619 + }, + { + "epoch": 1.7326039944027478, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9651347398757935, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8638635277748108, + "num_tokens": 519518883.0, + "step": 13620 + }, + { + "epoch": 1.7327312046813383, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1701598167419434, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8669003248214722, + "num_tokens": 519550123.0, + "step": 13621 + }, + { + "epoch": 1.7328584149599289, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7865073680877686, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8533379435539246, + "num_tokens": 519592538.0, + "step": 13622 + }, + { + "epoch": 1.7329856252385194, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.817855954170227, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8634994029998779, + "num_tokens": 519633466.0, + "step": 13623 + }, + { + "epoch": 1.73311283551711, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8139680624008179, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8587585687637329, + "num_tokens": 519672569.0, + "step": 13624 + }, + { + "epoch": 1.7332400457957005, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.934402585029602, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8686249256134033, + "num_tokens": 519707383.0, + "step": 13625 + }, + { + "epoch": 1.7333672560742908, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.01657772064209, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8441731929779053, + "num_tokens": 519748187.0, + "step": 13626 + }, + { + "epoch": 1.7334944663528813, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7976934909820557, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8598041534423828, + "num_tokens": 519790215.0, + "step": 13627 + }, + { + "epoch": 1.7336216766314718, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9556682109832764, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8543247580528259, + "num_tokens": 519827278.0, + "step": 13628 + }, + { + "epoch": 1.7337488869100623, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.513115167617798, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8690853118896484, + "num_tokens": 519860216.0, + "step": 13629 + }, + { + "epoch": 1.7338760971886529, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.906177282333374, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8614406585693359, + "num_tokens": 519894442.0, + "step": 13630 + }, + { + "epoch": 1.7340033074672434, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9574648141860962, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8628948330879211, + "num_tokens": 519932524.0, + "step": 13631 + }, + { + "epoch": 1.7341305177458337, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8722937107086182, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.860994815826416, + "num_tokens": 519974366.0, + "step": 13632 + }, + { + "epoch": 1.7342577280244242, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9414254426956177, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8724622130393982, + "num_tokens": 520013983.0, + "step": 13633 + }, + { + "epoch": 1.7343849383030148, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.027426242828369, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8645027875900269, + "num_tokens": 520047671.0, + "step": 13634 + }, + { + "epoch": 1.7345121485816053, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9017277956008911, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8587964773178101, + "num_tokens": 520086574.0, + "step": 13635 + }, + { + "epoch": 1.7346393588601958, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.829662799835205, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.84834885597229, + "num_tokens": 520130613.0, + "step": 13636 + }, + { + "epoch": 1.7347665691387864, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8813475370407104, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8806318640708923, + "num_tokens": 520163763.0, + "step": 13637 + }, + { + "epoch": 1.7348937794173769, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.061005115509033, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8599028587341309, + "num_tokens": 520198663.0, + "step": 13638 + }, + { + "epoch": 1.7350209896959674, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0546889305114746, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8661428093910217, + "num_tokens": 520231737.0, + "step": 13639 + }, + { + "epoch": 1.735148199974558, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.775014877319336, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8678335547447205, + "num_tokens": 520275664.0, + "step": 13640 + }, + { + "epoch": 1.7352754102531485, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.853887677192688, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8508418798446655, + "num_tokens": 520316100.0, + "step": 13641 + }, + { + "epoch": 1.735402620531739, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1815006732940674, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.85988849401474, + "num_tokens": 520348167.0, + "step": 13642 + }, + { + "epoch": 1.7355298308103295, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.062347650527954, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8494267463684082, + "num_tokens": 520385234.0, + "step": 13643 + }, + { + "epoch": 1.73565704108892, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.9538202285766602, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8465210199356079, + "num_tokens": 520423051.0, + "step": 13644 + }, + { + "epoch": 1.7357842513675106, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8234943151474, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8536046743392944, + "num_tokens": 520467202.0, + "step": 13645 + }, + { + "epoch": 1.735911461646101, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.417931079864502, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8612475395202637, + "num_tokens": 520501434.0, + "step": 13646 + }, + { + "epoch": 1.7360386719246916, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9598827362060547, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8590466976165771, + "num_tokens": 520542757.0, + "step": 13647 + }, + { + "epoch": 1.7361658822032822, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.020653486251831, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8517993688583374, + "num_tokens": 520585216.0, + "step": 13648 + }, + { + "epoch": 1.7362930924818727, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9034640789031982, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8631933927536011, + "num_tokens": 520620066.0, + "step": 13649 + }, + { + "epoch": 1.736420302760463, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8365321159362793, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8685611486434937, + "num_tokens": 520660917.0, + "step": 13650 + }, + { + "epoch": 1.7365475130390535, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8947064876556396, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8546867966651917, + "num_tokens": 520704255.0, + "step": 13651 + }, + { + "epoch": 1.736674723317644, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.98768150806427, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8493896126747131, + "num_tokens": 520743036.0, + "step": 13652 + }, + { + "epoch": 1.7368019335962346, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.997185468673706, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8551104664802551, + "num_tokens": 520778325.0, + "step": 13653 + }, + { + "epoch": 1.736929143874825, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.986624002456665, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8750605583190918, + "num_tokens": 520815226.0, + "step": 13654 + }, + { + "epoch": 1.7370563541534156, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9702250957489014, + "learning_rate": 1e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8422732353210449, + "num_tokens": 520856606.0, + "step": 13655 + }, + { + "epoch": 1.737183564432006, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.156391143798828, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8532601594924927, + "num_tokens": 520893493.0, + "step": 13656 + }, + { + "epoch": 1.7373107747105965, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9171298742294312, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8689137697219849, + "num_tokens": 520930391.0, + "step": 13657 + }, + { + "epoch": 1.737437984989187, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8422409296035767, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8626717925071716, + "num_tokens": 520970512.0, + "step": 13658 + }, + { + "epoch": 1.7375651952677775, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.897159218788147, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8540050983428955, + "num_tokens": 521011787.0, + "step": 13659 + }, + { + "epoch": 1.737692405546368, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8385086059570312, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8637669086456299, + "num_tokens": 521053494.0, + "step": 13660 + }, + { + "epoch": 1.7378196158249586, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.986160397529602, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8503574132919312, + "num_tokens": 521089291.0, + "step": 13661 + }, + { + "epoch": 1.7379468261035491, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8873827457427979, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8746722340583801, + "num_tokens": 521128910.0, + "step": 13662 + }, + { + "epoch": 1.7380740363821396, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9716744422912598, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8575136661529541, + "num_tokens": 521168175.0, + "step": 13663 + }, + { + "epoch": 1.7382012466607302, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8576737642288208, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8594367504119873, + "num_tokens": 521208020.0, + "step": 13664 + }, + { + "epoch": 1.7383284569393207, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.9166669845581055, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.873497486114502, + "num_tokens": 521249828.0, + "step": 13665 + }, + { + "epoch": 1.7384556672179112, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.871962070465088, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8655437231063843, + "num_tokens": 521289940.0, + "step": 13666 + }, + { + "epoch": 1.7385828774965018, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9302759170532227, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8803189992904663, + "num_tokens": 521327426.0, + "step": 13667 + }, + { + "epoch": 1.7387100877750923, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8342472314834595, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8535031080245972, + "num_tokens": 521372112.0, + "step": 13668 + }, + { + "epoch": 1.7388372980536828, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9677590131759644, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8615034818649292, + "num_tokens": 521410842.0, + "step": 13669 + }, + { + "epoch": 1.7389645083322733, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9032622575759888, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8718154430389404, + "num_tokens": 521446854.0, + "step": 13670 + }, + { + "epoch": 1.7390917186108639, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.964508295059204, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8532389402389526, + "num_tokens": 521486284.0, + "step": 13671 + }, + { + "epoch": 1.7392189288894544, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9775434732437134, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8491169810295105, + "num_tokens": 521522716.0, + "step": 13672 + }, + { + "epoch": 1.739346139168045, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.143846035003662, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8572648763656616, + "num_tokens": 521551549.0, + "step": 13673 + }, + { + "epoch": 1.7394733494466355, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9076344966888428, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8657020926475525, + "num_tokens": 521589552.0, + "step": 13674 + }, + { + "epoch": 1.7396005597252258, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8496496677398682, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8736082315444946, + "num_tokens": 521627906.0, + "step": 13675 + }, + { + "epoch": 1.7397277700038163, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9144811630249023, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8464375734329224, + "num_tokens": 521666615.0, + "step": 13676 + }, + { + "epoch": 1.7398549802824068, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9897739887237549, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8599748611450195, + "num_tokens": 521707742.0, + "step": 13677 + }, + { + "epoch": 1.7399821905609973, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.645383596420288, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8515545129776001, + "num_tokens": 521746848.0, + "step": 13678 + }, + { + "epoch": 1.7401094008395879, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9056562185287476, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.862862765789032, + "num_tokens": 521782917.0, + "step": 13679 + }, + { + "epoch": 1.7402366111181784, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9096450805664062, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8598849177360535, + "num_tokens": 521825609.0, + "step": 13680 + }, + { + "epoch": 1.7403638213967687, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8859834671020508, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8748316764831543, + "num_tokens": 521862277.0, + "step": 13681 + }, + { + "epoch": 1.7404910316753592, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.214959144592285, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8810405731201172, + "num_tokens": 521894951.0, + "step": 13682 + }, + { + "epoch": 1.7406182419539498, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.19305682182312, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8615943789482117, + "num_tokens": 521932158.0, + "step": 13683 + }, + { + "epoch": 1.7407454522325403, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.917594075202942, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8679947853088379, + "num_tokens": 521971050.0, + "step": 13684 + }, + { + "epoch": 1.7408726625111308, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.842854619026184, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8720850944519043, + "num_tokens": 522007617.0, + "step": 13685 + }, + { + "epoch": 1.7409998727897213, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 5.054605007171631, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8505235910415649, + "num_tokens": 522053484.0, + "step": 13686 + }, + { + "epoch": 1.7411270830683119, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.114827871322632, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8572337627410889, + "num_tokens": 522088047.0, + "step": 13687 + }, + { + "epoch": 1.7412542933469024, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.952924132347107, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8613881468772888, + "num_tokens": 522129865.0, + "step": 13688 + }, + { + "epoch": 1.741381503625493, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9133830070495605, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8577871322631836, + "num_tokens": 522170154.0, + "step": 13689 + }, + { + "epoch": 1.7415087139040835, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8010975122451782, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8863115310668945, + "num_tokens": 522204586.0, + "step": 13690 + }, + { + "epoch": 1.741635924182674, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8120242357254028, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8676201105117798, + "num_tokens": 522241942.0, + "step": 13691 + }, + { + "epoch": 1.7417631344612645, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.046935796737671, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8583157062530518, + "num_tokens": 522275800.0, + "step": 13692 + }, + { + "epoch": 1.741890344739855, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7431484460830688, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8566413521766663, + "num_tokens": 522317024.0, + "step": 13693 + }, + { + "epoch": 1.7420175550184456, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7690778970718384, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8728063106536865, + "num_tokens": 522358510.0, + "step": 13694 + }, + { + "epoch": 1.742144765297036, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2542202472686768, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.854714572429657, + "num_tokens": 522395960.0, + "step": 13695 + }, + { + "epoch": 1.7422719755756266, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.958548665046692, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8713139295578003, + "num_tokens": 522428807.0, + "step": 13696 + }, + { + "epoch": 1.7423991858542172, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.926320195198059, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8636645674705505, + "num_tokens": 522464472.0, + "step": 13697 + }, + { + "epoch": 1.7425263961328077, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.691834807395935, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8754436373710632, + "num_tokens": 522509522.0, + "step": 13698 + }, + { + "epoch": 1.742653606411398, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8255574703216553, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8468025922775269, + "num_tokens": 522545581.0, + "step": 13699 + }, + { + "epoch": 1.7427808166899885, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8232532739639282, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8656116724014282, + "num_tokens": 522582072.0, + "step": 13700 + }, + { + "epoch": 1.742908026968579, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9258208274841309, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.86534184217453, + "num_tokens": 522618183.0, + "step": 13701 + }, + { + "epoch": 1.7430352372471696, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8046281337738037, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.851662278175354, + "num_tokens": 522660578.0, + "step": 13702 + }, + { + "epoch": 1.74316244752576, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.913392186164856, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8735688924789429, + "num_tokens": 522701934.0, + "step": 13703 + }, + { + "epoch": 1.7432896578043506, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8985264301300049, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.880703866481781, + "num_tokens": 522737404.0, + "step": 13704 + }, + { + "epoch": 1.743416868082941, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.921370267868042, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8586283326148987, + "num_tokens": 522774188.0, + "step": 13705 + }, + { + "epoch": 1.7435440783615315, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9681199789047241, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8710933327674866, + "num_tokens": 522810467.0, + "step": 13706 + }, + { + "epoch": 1.743671288640122, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.964386224746704, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8518788814544678, + "num_tokens": 522845246.0, + "step": 13707 + }, + { + "epoch": 1.7437984989187125, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0319724082946777, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8731639385223389, + "num_tokens": 522877150.0, + "step": 13708 + }, + { + "epoch": 1.743925709197303, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9198346138000488, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.877250075340271, + "num_tokens": 522913161.0, + "step": 13709 + }, + { + "epoch": 1.7440529194758936, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8775267601013184, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8493136763572693, + "num_tokens": 522953464.0, + "step": 13710 + }, + { + "epoch": 1.744180129754484, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.910858392715454, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8674353361129761, + "num_tokens": 522996874.0, + "step": 13711 + }, + { + "epoch": 1.7443073400330746, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0196352005004883, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8649620413780212, + "num_tokens": 523031349.0, + "step": 13712 + }, + { + "epoch": 1.7444345503116652, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9366843700408936, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8555424809455872, + "num_tokens": 523068910.0, + "step": 13713 + }, + { + "epoch": 1.7445617605902557, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9929288625717163, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8779085874557495, + "num_tokens": 523106179.0, + "step": 13714 + }, + { + "epoch": 1.7446889708688462, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9534140825271606, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8648086190223694, + "num_tokens": 523140827.0, + "step": 13715 + }, + { + "epoch": 1.7448161811474368, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7294338941574097, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.876061737537384, + "num_tokens": 523182880.0, + "step": 13716 + }, + { + "epoch": 1.7449433914260273, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0567681789398193, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8669289946556091, + "num_tokens": 523213059.0, + "step": 13717 + }, + { + "epoch": 1.7450706017046178, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9210491180419922, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8615507483482361, + "num_tokens": 523254398.0, + "step": 13718 + }, + { + "epoch": 1.7451978119832083, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7533538341522217, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.874998927116394, + "num_tokens": 523293325.0, + "step": 13719 + }, + { + "epoch": 1.7453250222617989, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9133564233779907, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8523732423782349, + "num_tokens": 523334402.0, + "step": 13720 + }, + { + "epoch": 1.7454522325403894, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7830369472503662, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.86732017993927, + "num_tokens": 523376238.0, + "step": 13721 + }, + { + "epoch": 1.74557944281898, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.783245325088501, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8854323625564575, + "num_tokens": 523417074.0, + "step": 13722 + }, + { + "epoch": 1.7457066530975704, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7451103925704956, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8705617189407349, + "num_tokens": 523457984.0, + "step": 13723 + }, + { + "epoch": 1.7458338633761608, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.126110553741455, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8581196069717407, + "num_tokens": 523490071.0, + "step": 13724 + }, + { + "epoch": 1.7459610736547513, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.170992136001587, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8566553592681885, + "num_tokens": 523530105.0, + "step": 13725 + }, + { + "epoch": 1.7460882839333418, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.704079270362854, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8766276836395264, + "num_tokens": 523576698.0, + "step": 13726 + }, + { + "epoch": 1.7462154942119323, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7865811586380005, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8641436100006104, + "num_tokens": 523614387.0, + "step": 13727 + }, + { + "epoch": 1.7463427044905229, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0409417152404785, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8586379289627075, + "num_tokens": 523651149.0, + "step": 13728 + }, + { + "epoch": 1.7464699147691134, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.07450270652771, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8754409551620483, + "num_tokens": 523688448.0, + "step": 13729 + }, + { + "epoch": 1.7465971250477037, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7985174655914307, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8669949769973755, + "num_tokens": 523725805.0, + "step": 13730 + }, + { + "epoch": 1.7467243353262942, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9963244199752808, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8639693260192871, + "num_tokens": 523763564.0, + "step": 13731 + }, + { + "epoch": 1.7468515456048848, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9311751127243042, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8589881062507629, + "num_tokens": 523804422.0, + "step": 13732 + }, + { + "epoch": 1.7469787558834753, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7948659658432007, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8737359046936035, + "num_tokens": 523845889.0, + "step": 13733 + }, + { + "epoch": 1.7471059661620658, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.866274118423462, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8743826746940613, + "num_tokens": 523883623.0, + "step": 13734 + }, + { + "epoch": 1.7472331764406563, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.04960036277771, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8708124756813049, + "num_tokens": 523917273.0, + "step": 13735 + }, + { + "epoch": 1.7473603867192469, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8594392538070679, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8614476919174194, + "num_tokens": 523956149.0, + "step": 13736 + }, + { + "epoch": 1.7474875969978374, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8305282592773438, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8604885935783386, + "num_tokens": 523997229.0, + "step": 13737 + }, + { + "epoch": 1.747614807276428, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8537161350250244, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8687249422073364, + "num_tokens": 524032921.0, + "step": 13738 + }, + { + "epoch": 1.7477420175550185, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.195589065551758, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8652598261833191, + "num_tokens": 524065016.0, + "step": 13739 + }, + { + "epoch": 1.747869227833609, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8275827169418335, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8515585660934448, + "num_tokens": 524107001.0, + "step": 13740 + }, + { + "epoch": 1.7479964381121995, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8161216974258423, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8588482141494751, + "num_tokens": 524149283.0, + "step": 13741 + }, + { + "epoch": 1.74812364839079, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.948430061340332, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8632595539093018, + "num_tokens": 524187240.0, + "step": 13742 + }, + { + "epoch": 1.7482508586693806, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.888982892036438, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8725025057792664, + "num_tokens": 524216793.0, + "step": 13743 + }, + { + "epoch": 1.748378068947971, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8492671251296997, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8687874674797058, + "num_tokens": 524257629.0, + "step": 13744 + }, + { + "epoch": 1.7485052792265616, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.845115065574646, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8745929002761841, + "num_tokens": 524291090.0, + "step": 13745 + }, + { + "epoch": 1.7486324895051522, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8790316581726074, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8778917193412781, + "num_tokens": 524324882.0, + "step": 13746 + }, + { + "epoch": 1.7487596997837427, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9400304555892944, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8476837873458862, + "num_tokens": 524359422.0, + "step": 13747 + }, + { + "epoch": 1.748886910062333, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8494993448257446, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8686977624893188, + "num_tokens": 524398383.0, + "step": 13748 + }, + { + "epoch": 1.7490141203409235, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.204263687133789, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8602689504623413, + "num_tokens": 524437984.0, + "step": 13749 + }, + { + "epoch": 1.749141330619514, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.054619073867798, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.868610143661499, + "num_tokens": 524471324.0, + "step": 13750 + }, + { + "epoch": 1.7492685408981046, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9860060214996338, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8545553684234619, + "num_tokens": 524511471.0, + "step": 13751 + }, + { + "epoch": 1.749395751176695, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9791607856750488, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8666967153549194, + "num_tokens": 524541743.0, + "step": 13752 + }, + { + "epoch": 1.7495229614552856, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7717899084091187, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8642334342002869, + "num_tokens": 524581470.0, + "step": 13753 + }, + { + "epoch": 1.749650171733876, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 7.750089168548584, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8655016422271729, + "num_tokens": 524612722.0, + "step": 13754 + }, + { + "epoch": 1.7497773820124665, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8660099506378174, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8720685839653015, + "num_tokens": 524647640.0, + "step": 13755 + }, + { + "epoch": 1.749904592291057, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0863614082336426, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.857210636138916, + "num_tokens": 524682366.0, + "step": 13756 + }, + { + "epoch": 1.7500318025696475, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 20.57831573486328, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8636171817779541, + "num_tokens": 524724242.0, + "step": 13757 + }, + { + "epoch": 1.750159012848238, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0708675384521484, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8489823341369629, + "num_tokens": 524757373.0, + "step": 13758 + }, + { + "epoch": 1.7502862231268286, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9364093542099, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8538960814476013, + "num_tokens": 524797704.0, + "step": 13759 + }, + { + "epoch": 1.750413433405419, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.208803176879883, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8653850555419922, + "num_tokens": 524836710.0, + "step": 13760 + }, + { + "epoch": 1.7505406436840096, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9478434324264526, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8617535829544067, + "num_tokens": 524870224.0, + "step": 13761 + }, + { + "epoch": 1.7506678539626002, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9315617084503174, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8663313388824463, + "num_tokens": 524905345.0, + "step": 13762 + }, + { + "epoch": 1.7507950642411907, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9972813129425049, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8465245366096497, + "num_tokens": 524943893.0, + "step": 13763 + }, + { + "epoch": 1.7509222745197812, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7799506187438965, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8614489436149597, + "num_tokens": 524982924.0, + "step": 13764 + }, + { + "epoch": 1.7510494847983717, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.888671875, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8623442053794861, + "num_tokens": 525016749.0, + "step": 13765 + }, + { + "epoch": 1.7511766950769623, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1191859245300293, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8662161827087402, + "num_tokens": 525049178.0, + "step": 13766 + }, + { + "epoch": 1.7513039053555528, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0214500427246094, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8493265509605408, + "num_tokens": 525083456.0, + "step": 13767 + }, + { + "epoch": 1.7514311156341433, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9889370203018188, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.86659175157547, + "num_tokens": 525120159.0, + "step": 13768 + }, + { + "epoch": 1.7515583259127339, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.960382103919983, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8683477640151978, + "num_tokens": 525159174.0, + "step": 13769 + }, + { + "epoch": 1.7516855361913244, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9149774312973022, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8623180389404297, + "num_tokens": 525198373.0, + "step": 13770 + }, + { + "epoch": 1.751812746469915, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7575578689575195, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.870042622089386, + "num_tokens": 525240618.0, + "step": 13771 + }, + { + "epoch": 1.7519399567485054, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9579648971557617, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8788806200027466, + "num_tokens": 525274899.0, + "step": 13772 + }, + { + "epoch": 1.7520671670270958, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 7.745002746582031, + "learning_rate": 1e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.842718243598938, + "num_tokens": 525310144.0, + "step": 13773 + }, + { + "epoch": 1.7521943773056863, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.629188060760498, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8734399080276489, + "num_tokens": 525348565.0, + "step": 13774 + }, + { + "epoch": 1.7523215875842768, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.13739275932312, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8702812194824219, + "num_tokens": 525383709.0, + "step": 13775 + }, + { + "epoch": 1.7524487978628673, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9446887969970703, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8576989769935608, + "num_tokens": 525423983.0, + "step": 13776 + }, + { + "epoch": 1.7525760081414579, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9329044818878174, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8702911734580994, + "num_tokens": 525457357.0, + "step": 13777 + }, + { + "epoch": 1.7527032184200484, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8733556270599365, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8620259761810303, + "num_tokens": 525497021.0, + "step": 13778 + }, + { + "epoch": 1.7528304286986387, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7964218854904175, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8794347643852234, + "num_tokens": 525533592.0, + "step": 13779 + }, + { + "epoch": 1.7529576389772292, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.831272840499878, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8565250635147095, + "num_tokens": 525572747.0, + "step": 13780 + }, + { + "epoch": 1.7530848492558198, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9047281742095947, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8695876598358154, + "num_tokens": 525613578.0, + "step": 13781 + }, + { + "epoch": 1.7532120595344103, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9758176803588867, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8703333735466003, + "num_tokens": 525655047.0, + "step": 13782 + }, + { + "epoch": 1.7533392698130008, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.926832914352417, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8597012758255005, + "num_tokens": 525689561.0, + "step": 13783 + }, + { + "epoch": 1.7534664800915913, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8819323778152466, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.860824704170227, + "num_tokens": 525726946.0, + "step": 13784 + }, + { + "epoch": 1.7535936903701819, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.013532876968384, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8562170267105103, + "num_tokens": 525762089.0, + "step": 13785 + }, + { + "epoch": 1.7537209006487724, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.063438892364502, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8689044117927551, + "num_tokens": 525796081.0, + "step": 13786 + }, + { + "epoch": 1.753848110927363, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1802330017089844, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8532220721244812, + "num_tokens": 525829945.0, + "step": 13787 + }, + { + "epoch": 1.7539753212059535, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9367761611938477, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8725492358207703, + "num_tokens": 525871289.0, + "step": 13788 + }, + { + "epoch": 1.754102531484544, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.828608751296997, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8648599982261658, + "num_tokens": 525908455.0, + "step": 13789 + }, + { + "epoch": 1.7542297417631345, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0369017124176025, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8845335245132446, + "num_tokens": 525943734.0, + "step": 13790 + }, + { + "epoch": 1.754356952041725, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6710656881332397, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8876720070838928, + "num_tokens": 525988070.0, + "step": 13791 + }, + { + "epoch": 1.7544841623203156, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7747398614883423, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8791736960411072, + "num_tokens": 526027940.0, + "step": 13792 + }, + { + "epoch": 1.754611372598906, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.914721965789795, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8750431537628174, + "num_tokens": 526061357.0, + "step": 13793 + }, + { + "epoch": 1.7547385828774966, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.742956519126892, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8669930696487427, + "num_tokens": 526102157.0, + "step": 13794 + }, + { + "epoch": 1.7548657931560872, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0343003273010254, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8554972410202026, + "num_tokens": 526137241.0, + "step": 13795 + }, + { + "epoch": 1.7549930034346777, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0851876735687256, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8565816879272461, + "num_tokens": 526169079.0, + "step": 13796 + }, + { + "epoch": 1.755120213713268, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8185323476791382, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8686795234680176, + "num_tokens": 526212212.0, + "step": 13797 + }, + { + "epoch": 1.7552474239918585, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8770158290863037, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8721124529838562, + "num_tokens": 526250380.0, + "step": 13798 + }, + { + "epoch": 1.755374634270449, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7441036701202393, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8697716593742371, + "num_tokens": 526292525.0, + "step": 13799 + }, + { + "epoch": 1.7555018445490396, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8136299848556519, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.865426778793335, + "num_tokens": 526330427.0, + "step": 13800 + }, + { + "epoch": 1.75562905482763, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.956278920173645, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8650232553482056, + "num_tokens": 526362954.0, + "step": 13801 + }, + { + "epoch": 1.7557562651062206, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2969281673431396, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8577213287353516, + "num_tokens": 526394310.0, + "step": 13802 + }, + { + "epoch": 1.755883475384811, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9932324886322021, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8453370928764343, + "num_tokens": 526438507.0, + "step": 13803 + }, + { + "epoch": 1.7560106856634015, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9278011322021484, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.861350417137146, + "num_tokens": 526475124.0, + "step": 13804 + }, + { + "epoch": 1.756137895941992, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8177416324615479, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8719167709350586, + "num_tokens": 526514936.0, + "step": 13805 + }, + { + "epoch": 1.7562651062205825, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9419784545898438, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8587900996208191, + "num_tokens": 526548649.0, + "step": 13806 + }, + { + "epoch": 1.756392316499173, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.873669147491455, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.84897381067276, + "num_tokens": 526589028.0, + "step": 13807 + }, + { + "epoch": 1.7565195267777636, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7988327741622925, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8639148473739624, + "num_tokens": 526626327.0, + "step": 13808 + }, + { + "epoch": 1.756646737056354, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6862027645111084, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8608040809631348, + "num_tokens": 526672264.0, + "step": 13809 + }, + { + "epoch": 1.7567739473349446, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.7276198863983154, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8574049472808838, + "num_tokens": 526709979.0, + "step": 13810 + }, + { + "epoch": 1.7569011576135352, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9737306833267212, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8780489563941956, + "num_tokens": 526745002.0, + "step": 13811 + }, + { + "epoch": 1.7570283678921257, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7447068691253662, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8799797296524048, + "num_tokens": 526785684.0, + "step": 13812 + }, + { + "epoch": 1.7571555781707162, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8775511980056763, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.883411705493927, + "num_tokens": 526820246.0, + "step": 13813 + }, + { + "epoch": 1.7572827884493067, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9134458303451538, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8675429821014404, + "num_tokens": 526854815.0, + "step": 13814 + }, + { + "epoch": 1.7574099987278973, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6865273714065552, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8708024621009827, + "num_tokens": 526896336.0, + "step": 13815 + }, + { + "epoch": 1.7575372090064878, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.677366852760315, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8753029704093933, + "num_tokens": 526939586.0, + "step": 13816 + }, + { + "epoch": 1.7576644192850783, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0498929023742676, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8691891431808472, + "num_tokens": 526975027.0, + "step": 13817 + }, + { + "epoch": 1.7577916295636689, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8345903158187866, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.86688631772995, + "num_tokens": 527017820.0, + "step": 13818 + }, + { + "epoch": 1.7579188398422594, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.7572474479675293, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8755760192871094, + "num_tokens": 527051647.0, + "step": 13819 + }, + { + "epoch": 1.75804605012085, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8249671459197998, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8690064549446106, + "num_tokens": 527089590.0, + "step": 13820 + }, + { + "epoch": 1.7581732603994404, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1483876705169678, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8630805611610413, + "num_tokens": 527126696.0, + "step": 13821 + }, + { + "epoch": 1.7583004706780307, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0614233016967773, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8556117415428162, + "num_tokens": 527164823.0, + "step": 13822 + }, + { + "epoch": 1.7584276809566213, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.043699264526367, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8475807905197144, + "num_tokens": 527208032.0, + "step": 13823 + }, + { + "epoch": 1.7585548912352118, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0123579502105713, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8674647808074951, + "num_tokens": 527242773.0, + "step": 13824 + }, + { + "epoch": 1.7586821015138023, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8639780282974243, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8680968284606934, + "num_tokens": 527277968.0, + "step": 13825 + }, + { + "epoch": 1.7588093117923929, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8790000677108765, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8784854412078857, + "num_tokens": 527311077.0, + "step": 13826 + }, + { + "epoch": 1.7589365220709834, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0371253490448, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8717993497848511, + "num_tokens": 527340543.0, + "step": 13827 + }, + { + "epoch": 1.7590637323495737, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0585875511169434, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8628589510917664, + "num_tokens": 527379596.0, + "step": 13828 + }, + { + "epoch": 1.7591909426281642, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8633666038513184, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8641674518585205, + "num_tokens": 527417873.0, + "step": 13829 + }, + { + "epoch": 1.7593181529067548, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.835689902305603, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8680359125137329, + "num_tokens": 527458607.0, + "step": 13830 + }, + { + "epoch": 1.7594453631853453, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.082702159881592, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8570427894592285, + "num_tokens": 527493299.0, + "step": 13831 + }, + { + "epoch": 1.7595725734639358, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.848749041557312, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8526278734207153, + "num_tokens": 527536065.0, + "step": 13832 + }, + { + "epoch": 1.7596997837425263, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9047291278839111, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8493643999099731, + "num_tokens": 527576477.0, + "step": 13833 + }, + { + "epoch": 1.7598269940211169, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 7.788997173309326, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8626431226730347, + "num_tokens": 527611777.0, + "step": 13834 + }, + { + "epoch": 1.7599542042997074, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8564929962158203, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8818740844726562, + "num_tokens": 527650236.0, + "step": 13835 + }, + { + "epoch": 1.760081414578298, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8762168884277344, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8702961206436157, + "num_tokens": 527685486.0, + "step": 13836 + }, + { + "epoch": 1.7602086248568884, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8763091564178467, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8517363667488098, + "num_tokens": 527725386.0, + "step": 13837 + }, + { + "epoch": 1.760335835135479, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.063216209411621, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8530439138412476, + "num_tokens": 527756213.0, + "step": 13838 + }, + { + "epoch": 1.7604630454140695, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.799940586090088, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8696590662002563, + "num_tokens": 527793884.0, + "step": 13839 + }, + { + "epoch": 1.76059025569266, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9760940074920654, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8625181317329407, + "num_tokens": 527826428.0, + "step": 13840 + }, + { + "epoch": 1.7607174659712506, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 3.0900261402130127, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.858314037322998, + "num_tokens": 527866691.0, + "step": 13841 + }, + { + "epoch": 1.760844676249841, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8343658447265625, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8738601803779602, + "num_tokens": 527909482.0, + "step": 13842 + }, + { + "epoch": 1.7609718865284316, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.854834794998169, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8531812429428101, + "num_tokens": 527947818.0, + "step": 13843 + }, + { + "epoch": 1.7610990968070221, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7628270387649536, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8678152561187744, + "num_tokens": 527986760.0, + "step": 13844 + }, + { + "epoch": 1.7612263070856127, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.891697883605957, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8588445782661438, + "num_tokens": 528024742.0, + "step": 13845 + }, + { + "epoch": 1.761353517364203, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9550175666809082, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8491215705871582, + "num_tokens": 528060781.0, + "step": 13846 + }, + { + "epoch": 1.7614807276427935, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.004423141479492, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.867100715637207, + "num_tokens": 528099051.0, + "step": 13847 + }, + { + "epoch": 1.761607937921384, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8681656122207642, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8581976294517517, + "num_tokens": 528138325.0, + "step": 13848 + }, + { + "epoch": 1.7617351481999746, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7750566005706787, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8555111885070801, + "num_tokens": 528177997.0, + "step": 13849 + }, + { + "epoch": 1.761862358478565, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8138837814331055, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8619704842567444, + "num_tokens": 528216360.0, + "step": 13850 + }, + { + "epoch": 1.7619895687571556, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8210219144821167, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8812029361724854, + "num_tokens": 528251100.0, + "step": 13851 + }, + { + "epoch": 1.762116779035746, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0584030151367188, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8685305118560791, + "num_tokens": 528286298.0, + "step": 13852 + }, + { + "epoch": 1.7622439893143365, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8109334707260132, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8802615404129028, + "num_tokens": 528322970.0, + "step": 13853 + }, + { + "epoch": 1.762371199592927, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.936599612236023, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8804764747619629, + "num_tokens": 528355191.0, + "step": 13854 + }, + { + "epoch": 1.7624984098715175, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8494670391082764, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8641143441200256, + "num_tokens": 528388975.0, + "step": 13855 + }, + { + "epoch": 1.762625620150108, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.709546446800232, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8693914413452148, + "num_tokens": 528432139.0, + "step": 13856 + }, + { + "epoch": 1.7627528304286986, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9072827100753784, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8735160231590271, + "num_tokens": 528468315.0, + "step": 13857 + }, + { + "epoch": 1.762880040707289, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8050929307937622, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8543429374694824, + "num_tokens": 528510368.0, + "step": 13858 + }, + { + "epoch": 1.7630072509858796, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 3.0530107021331787, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8668491244316101, + "num_tokens": 528545637.0, + "step": 13859 + }, + { + "epoch": 1.7631344612644702, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8764064311981201, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8691079616546631, + "num_tokens": 528586267.0, + "step": 13860 + }, + { + "epoch": 1.7632616715430607, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7641675472259521, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8686329126358032, + "num_tokens": 528629640.0, + "step": 13861 + }, + { + "epoch": 1.7633888818216512, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8759883642196655, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.850287675857544, + "num_tokens": 528665373.0, + "step": 13862 + }, + { + "epoch": 1.7635160921002417, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8798844814300537, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8795532584190369, + "num_tokens": 528703426.0, + "step": 13863 + }, + { + "epoch": 1.7636433023788323, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1494088172912598, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8448991775512695, + "num_tokens": 528744886.0, + "step": 13864 + }, + { + "epoch": 1.7637705126574228, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8613879680633545, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8768960237503052, + "num_tokens": 528779773.0, + "step": 13865 + }, + { + "epoch": 1.7638977229360133, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.822709321975708, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.867039680480957, + "num_tokens": 528821878.0, + "step": 13866 + }, + { + "epoch": 1.7640249332146039, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.063244581222534, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8659704923629761, + "num_tokens": 528858056.0, + "step": 13867 + }, + { + "epoch": 1.7641521434931944, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7532274723052979, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.860727071762085, + "num_tokens": 528903116.0, + "step": 13868 + }, + { + "epoch": 1.764279353771785, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8292834758758545, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8587354421615601, + "num_tokens": 528941561.0, + "step": 13869 + }, + { + "epoch": 1.7644065640503754, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.862490177154541, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8805602192878723, + "num_tokens": 528976215.0, + "step": 13870 + }, + { + "epoch": 1.7645337743289657, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7922389507293701, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8721601366996765, + "num_tokens": 529019280.0, + "step": 13871 + }, + { + "epoch": 1.7646609846075563, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.772185206413269, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8680781126022339, + "num_tokens": 529060733.0, + "step": 13872 + }, + { + "epoch": 1.7647881948861468, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8808562755584717, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8665264844894409, + "num_tokens": 529094948.0, + "step": 13873 + }, + { + "epoch": 1.7649154051647373, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7708183526992798, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8765624761581421, + "num_tokens": 529131374.0, + "step": 13874 + }, + { + "epoch": 1.7650426154433279, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9473999738693237, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8618334531784058, + "num_tokens": 529169133.0, + "step": 13875 + }, + { + "epoch": 1.7651698257219184, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.857265830039978, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8643851280212402, + "num_tokens": 529206467.0, + "step": 13876 + }, + { + "epoch": 1.7652970360005087, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9547386169433594, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8550960421562195, + "num_tokens": 529244891.0, + "step": 13877 + }, + { + "epoch": 1.7654242462790992, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1656997203826904, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8662330508232117, + "num_tokens": 529277384.0, + "step": 13878 + }, + { + "epoch": 1.7655514565576897, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.767327904701233, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8681769371032715, + "num_tokens": 529322530.0, + "step": 13879 + }, + { + "epoch": 1.7656786668362803, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.979915738105774, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8592202067375183, + "num_tokens": 529356345.0, + "step": 13880 + }, + { + "epoch": 1.7658058771148708, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.925876498222351, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8669971227645874, + "num_tokens": 529391877.0, + "step": 13881 + }, + { + "epoch": 1.7659330873934613, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9321684837341309, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8715379238128662, + "num_tokens": 529425460.0, + "step": 13882 + }, + { + "epoch": 1.7660602976720519, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.951490044593811, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8541085720062256, + "num_tokens": 529459660.0, + "step": 13883 + }, + { + "epoch": 1.7661875079506424, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9905155897140503, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8569520711898804, + "num_tokens": 529501839.0, + "step": 13884 + }, + { + "epoch": 1.766314718229233, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9414464235305786, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8621424436569214, + "num_tokens": 529540828.0, + "step": 13885 + }, + { + "epoch": 1.7664419285078234, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8459076881408691, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8653282523155212, + "num_tokens": 529581578.0, + "step": 13886 + }, + { + "epoch": 1.766569138786414, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8659216165542603, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8647701740264893, + "num_tokens": 529624363.0, + "step": 13887 + }, + { + "epoch": 1.7666963490650045, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8065040111541748, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8728489875793457, + "num_tokens": 529665286.0, + "step": 13888 + }, + { + "epoch": 1.766823559343595, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9278037548065186, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8617368340492249, + "num_tokens": 529704150.0, + "step": 13889 + }, + { + "epoch": 1.7669507696221856, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.912163496017456, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8672142028808594, + "num_tokens": 529742110.0, + "step": 13890 + }, + { + "epoch": 1.767077979900776, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9098708629608154, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.864727258682251, + "num_tokens": 529776158.0, + "step": 13891 + }, + { + "epoch": 1.7672051901793666, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.114905834197998, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8411906361579895, + "num_tokens": 529814002.0, + "step": 13892 + }, + { + "epoch": 1.7673324004579571, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9283620119094849, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8674142360687256, + "num_tokens": 529846696.0, + "step": 13893 + }, + { + "epoch": 1.7674596107365477, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9437029361724854, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8619911670684814, + "num_tokens": 529881557.0, + "step": 13894 + }, + { + "epoch": 1.767586821015138, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.806249976158142, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8513422608375549, + "num_tokens": 529924337.0, + "step": 13895 + }, + { + "epoch": 1.7677140312937285, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9328910112380981, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.862646222114563, + "num_tokens": 529967195.0, + "step": 13896 + }, + { + "epoch": 1.767841241572319, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.168424606323242, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8461885452270508, + "num_tokens": 530003454.0, + "step": 13897 + }, + { + "epoch": 1.7679684518509096, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9811198711395264, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8388371467590332, + "num_tokens": 530042577.0, + "step": 13898 + }, + { + "epoch": 1.7680956621295, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8877328634262085, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8490234613418579, + "num_tokens": 530084903.0, + "step": 13899 + }, + { + "epoch": 1.7682228724080906, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0479373931884766, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8611910939216614, + "num_tokens": 530117598.0, + "step": 13900 + }, + { + "epoch": 1.768350082686681, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8871543407440186, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8678012490272522, + "num_tokens": 530151891.0, + "step": 13901 + }, + { + "epoch": 1.7684772929652715, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8092107772827148, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8556510210037231, + "num_tokens": 530191263.0, + "step": 13902 + }, + { + "epoch": 1.768604503243862, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8216744661331177, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8601375222206116, + "num_tokens": 530231156.0, + "step": 13903 + }, + { + "epoch": 1.7687317135224525, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9543328285217285, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8781338930130005, + "num_tokens": 530270089.0, + "step": 13904 + }, + { + "epoch": 1.768858923801043, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8499908447265625, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.868080735206604, + "num_tokens": 530309808.0, + "step": 13905 + }, + { + "epoch": 1.7689861340796336, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.721694827079773, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8528703451156616, + "num_tokens": 530357723.0, + "step": 13906 + }, + { + "epoch": 1.769113344358224, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.6245710849761963, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8521417379379272, + "num_tokens": 530389586.0, + "step": 13907 + }, + { + "epoch": 1.7692405546368146, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8529142141342163, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8542037010192871, + "num_tokens": 530430356.0, + "step": 13908 + }, + { + "epoch": 1.7693677649154052, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7024258375167847, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8696783781051636, + "num_tokens": 530472859.0, + "step": 13909 + }, + { + "epoch": 1.7694949751939957, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.900097131729126, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8517929315567017, + "num_tokens": 530515422.0, + "step": 13910 + }, + { + "epoch": 1.7696221854725862, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.723235011100769, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.862777829170227, + "num_tokens": 530553842.0, + "step": 13911 + }, + { + "epoch": 1.7697493957511767, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7609083652496338, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.863095223903656, + "num_tokens": 530592593.0, + "step": 13912 + }, + { + "epoch": 1.7698766060297673, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8846979141235352, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8581209778785706, + "num_tokens": 530632309.0, + "step": 13913 + }, + { + "epoch": 1.7700038163083578, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.963265299797058, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8596386313438416, + "num_tokens": 530667415.0, + "step": 13914 + }, + { + "epoch": 1.7701310265869483, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7966364622116089, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8608604073524475, + "num_tokens": 530705736.0, + "step": 13915 + }, + { + "epoch": 1.7702582368655388, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8576995134353638, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8708367347717285, + "num_tokens": 530744931.0, + "step": 13916 + }, + { + "epoch": 1.7703854471441294, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6860464811325073, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.876476526260376, + "num_tokens": 530784700.0, + "step": 13917 + }, + { + "epoch": 1.77051265742272, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8035656213760376, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8801456689834595, + "num_tokens": 530821097.0, + "step": 13918 + }, + { + "epoch": 1.7706398677013102, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9618762731552124, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8576160669326782, + "num_tokens": 530860444.0, + "step": 13919 + }, + { + "epoch": 1.7707670779799007, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7729860544204712, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8611847162246704, + "num_tokens": 530907304.0, + "step": 13920 + }, + { + "epoch": 1.7708942882584913, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.929154634475708, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.854861855506897, + "num_tokens": 530945692.0, + "step": 13921 + }, + { + "epoch": 1.7710214985370818, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.851793885231018, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8730063438415527, + "num_tokens": 530986695.0, + "step": 13922 + }, + { + "epoch": 1.7711487088156723, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8429580926895142, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8747994303703308, + "num_tokens": 531022656.0, + "step": 13923 + }, + { + "epoch": 1.7712759190942629, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9015854597091675, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8711722493171692, + "num_tokens": 531056896.0, + "step": 13924 + }, + { + "epoch": 1.7714031293728534, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0449090003967285, + "learning_rate": 1e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8391506671905518, + "num_tokens": 531097286.0, + "step": 13925 + }, + { + "epoch": 1.7715303396514437, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7810178995132446, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8720237016677856, + "num_tokens": 531136430.0, + "step": 13926 + }, + { + "epoch": 1.7716575499300342, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0490047931671143, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8555736541748047, + "num_tokens": 531176698.0, + "step": 13927 + }, + { + "epoch": 1.7717847602086247, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8083134889602661, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8617949485778809, + "num_tokens": 531214343.0, + "step": 13928 + }, + { + "epoch": 1.7719119704872153, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8462663888931274, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8759187459945679, + "num_tokens": 531252325.0, + "step": 13929 + }, + { + "epoch": 1.7720391807658058, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.5896520614624023, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8513270616531372, + "num_tokens": 531290378.0, + "step": 13930 + }, + { + "epoch": 1.7721663910443963, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8882664442062378, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8535834550857544, + "num_tokens": 531330406.0, + "step": 13931 + }, + { + "epoch": 1.7722936013229869, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.921802282333374, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8635994791984558, + "num_tokens": 531367425.0, + "step": 13932 + }, + { + "epoch": 1.7724208116015774, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8040677309036255, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8840726613998413, + "num_tokens": 531406465.0, + "step": 13933 + }, + { + "epoch": 1.772548021880168, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8516876697540283, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8533037900924683, + "num_tokens": 531445356.0, + "step": 13934 + }, + { + "epoch": 1.7726752321587584, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8088122606277466, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8599018454551697, + "num_tokens": 531484737.0, + "step": 13935 + }, + { + "epoch": 1.772802442437349, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9266287088394165, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8608418107032776, + "num_tokens": 531523453.0, + "step": 13936 + }, + { + "epoch": 1.7729296527159395, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7936567068099976, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8711999654769897, + "num_tokens": 531559627.0, + "step": 13937 + }, + { + "epoch": 1.77305686299453, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7489781379699707, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8681092262268066, + "num_tokens": 531597677.0, + "step": 13938 + }, + { + "epoch": 1.7731840732731206, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6891123056411743, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8717699646949768, + "num_tokens": 531642753.0, + "step": 13939 + }, + { + "epoch": 1.773311283551711, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9427111148834229, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8574122190475464, + "num_tokens": 531688249.0, + "step": 13940 + }, + { + "epoch": 1.7734384938303016, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8374708890914917, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8777909874916077, + "num_tokens": 531727994.0, + "step": 13941 + }, + { + "epoch": 1.7735657041088921, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8904576301574707, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8655547499656677, + "num_tokens": 531764428.0, + "step": 13942 + }, + { + "epoch": 1.7736929143874827, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.864075779914856, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.858703076839447, + "num_tokens": 531806604.0, + "step": 13943 + }, + { + "epoch": 1.773820124666073, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7902942895889282, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8818241357803345, + "num_tokens": 531845504.0, + "step": 13944 + }, + { + "epoch": 1.7739473349446635, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8154741525650024, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8721721768379211, + "num_tokens": 531880962.0, + "step": 13945 + }, + { + "epoch": 1.774074545223254, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8853697776794434, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8614081144332886, + "num_tokens": 531918182.0, + "step": 13946 + }, + { + "epoch": 1.7742017555018446, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8844387531280518, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8655828833580017, + "num_tokens": 531958340.0, + "step": 13947 + }, + { + "epoch": 1.774328965780435, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9023443460464478, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8517763614654541, + "num_tokens": 531996951.0, + "step": 13948 + }, + { + "epoch": 1.7744561760590256, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8307421207427979, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8668324947357178, + "num_tokens": 532036963.0, + "step": 13949 + }, + { + "epoch": 1.774583386337616, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0423855781555176, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8577228784561157, + "num_tokens": 532070937.0, + "step": 13950 + }, + { + "epoch": 1.7747105966162064, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.94452965259552, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8497710227966309, + "num_tokens": 532107866.0, + "step": 13951 + }, + { + "epoch": 1.774837806894797, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8816720247268677, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8573542833328247, + "num_tokens": 532151485.0, + "step": 13952 + }, + { + "epoch": 1.7749650171733875, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9212844371795654, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8700923323631287, + "num_tokens": 532191209.0, + "step": 13953 + }, + { + "epoch": 1.775092227451978, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9119672775268555, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8585571646690369, + "num_tokens": 532232219.0, + "step": 13954 + }, + { + "epoch": 1.7752194377305686, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0423243045806885, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8515390157699585, + "num_tokens": 532265330.0, + "step": 13955 + }, + { + "epoch": 1.775346648009159, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.370084762573242, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8611618280410767, + "num_tokens": 532295964.0, + "step": 13956 + }, + { + "epoch": 1.7754738582877496, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8879585266113281, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8660293817520142, + "num_tokens": 532333804.0, + "step": 13957 + }, + { + "epoch": 1.7756010685663401, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9315335750579834, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8676817417144775, + "num_tokens": 532373088.0, + "step": 13958 + }, + { + "epoch": 1.7757282788449307, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.958336353302002, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8614081740379333, + "num_tokens": 532409473.0, + "step": 13959 + }, + { + "epoch": 1.7758554891235212, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7448039054870605, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8760333061218262, + "num_tokens": 532447883.0, + "step": 13960 + }, + { + "epoch": 1.7759826994021117, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8223235607147217, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8687764406204224, + "num_tokens": 532485783.0, + "step": 13961 + }, + { + "epoch": 1.7761099096807023, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.154067277908325, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8702073097229004, + "num_tokens": 532511082.0, + "step": 13962 + }, + { + "epoch": 1.7762371199592928, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.755614161491394, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8811959028244019, + "num_tokens": 532555516.0, + "step": 13963 + }, + { + "epoch": 1.7763643302378833, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0050482749938965, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8488311767578125, + "num_tokens": 532599200.0, + "step": 13964 + }, + { + "epoch": 1.7764915405164738, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9271609783172607, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8735007047653198, + "num_tokens": 532636335.0, + "step": 13965 + }, + { + "epoch": 1.7766187507950644, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7749359607696533, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8761211633682251, + "num_tokens": 532678337.0, + "step": 13966 + }, + { + "epoch": 1.776745961073655, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8109076023101807, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8608585596084595, + "num_tokens": 532715051.0, + "step": 13967 + }, + { + "epoch": 1.7768731713522452, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8206524848937988, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8738263845443726, + "num_tokens": 532750836.0, + "step": 13968 + }, + { + "epoch": 1.7770003816308357, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9736311435699463, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8700432181358337, + "num_tokens": 532790331.0, + "step": 13969 + }, + { + "epoch": 1.7771275919094263, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8090596199035645, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8605095744132996, + "num_tokens": 532830169.0, + "step": 13970 + }, + { + "epoch": 1.7772548021880168, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.853269100189209, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8576494455337524, + "num_tokens": 532865957.0, + "step": 13971 + }, + { + "epoch": 1.7773820124666073, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8197616338729858, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8606330752372742, + "num_tokens": 532906112.0, + "step": 13972 + }, + { + "epoch": 1.7775092227451978, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9786909818649292, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8720055818557739, + "num_tokens": 532939676.0, + "step": 13973 + }, + { + "epoch": 1.7776364330237884, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9565201997756958, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8463264107704163, + "num_tokens": 532977668.0, + "step": 13974 + }, + { + "epoch": 1.7777636433023787, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9961233139038086, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8609215021133423, + "num_tokens": 533013862.0, + "step": 13975 + }, + { + "epoch": 1.7778908535809692, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.167957067489624, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.85295170545578, + "num_tokens": 533051222.0, + "step": 13976 + }, + { + "epoch": 1.7780180638595597, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7088038921356201, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8613526225090027, + "num_tokens": 533090182.0, + "step": 13977 + }, + { + "epoch": 1.7781452741381503, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8425099849700928, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8761618137359619, + "num_tokens": 533129600.0, + "step": 13978 + }, + { + "epoch": 1.7782724844167408, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.7398815155029297, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8637410998344421, + "num_tokens": 533172351.0, + "step": 13979 + }, + { + "epoch": 1.7783996946953313, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9035736322402954, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8614658713340759, + "num_tokens": 533211583.0, + "step": 13980 + }, + { + "epoch": 1.7785269049739219, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8784791231155396, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8702803254127502, + "num_tokens": 533249960.0, + "step": 13981 + }, + { + "epoch": 1.7786541152525124, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8648760318756104, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8717312812805176, + "num_tokens": 533287328.0, + "step": 13982 + }, + { + "epoch": 1.778781325531103, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9173487424850464, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8651366233825684, + "num_tokens": 533327772.0, + "step": 13983 + }, + { + "epoch": 1.7789085358096934, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6917611360549927, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8575046062469482, + "num_tokens": 533372004.0, + "step": 13984 + }, + { + "epoch": 1.779035746088284, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8798335790634155, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8601976633071899, + "num_tokens": 533409502.0, + "step": 13985 + }, + { + "epoch": 1.7791629563668745, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8615199327468872, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.851908802986145, + "num_tokens": 533451160.0, + "step": 13986 + }, + { + "epoch": 1.779290166645465, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9634000062942505, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8717763423919678, + "num_tokens": 533480413.0, + "step": 13987 + }, + { + "epoch": 1.7794173769240555, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.904582142829895, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8513067960739136, + "num_tokens": 533518826.0, + "step": 13988 + }, + { + "epoch": 1.779544587202646, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7961870431900024, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8539900779724121, + "num_tokens": 533563760.0, + "step": 13989 + }, + { + "epoch": 1.7796717974812366, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.83511221408844, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8634703755378723, + "num_tokens": 533601499.0, + "step": 13990 + }, + { + "epoch": 1.7797990077598271, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9102100133895874, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8681914210319519, + "num_tokens": 533640757.0, + "step": 13991 + }, + { + "epoch": 1.7799262180384177, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7777243852615356, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8664387464523315, + "num_tokens": 533678047.0, + "step": 13992 + }, + { + "epoch": 1.780053428317008, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1864120960235596, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.863956093788147, + "num_tokens": 533720903.0, + "step": 13993 + }, + { + "epoch": 1.7801806385955985, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8705424070358276, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8657615780830383, + "num_tokens": 533759720.0, + "step": 13994 + }, + { + "epoch": 1.780307848874189, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0558385848999023, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8517906665802002, + "num_tokens": 533796122.0, + "step": 13995 + }, + { + "epoch": 1.7804350591527796, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8832019567489624, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8699589967727661, + "num_tokens": 533836326.0, + "step": 13996 + }, + { + "epoch": 1.78056226943137, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9564094543457031, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.87398761510849, + "num_tokens": 533867874.0, + "step": 13997 + }, + { + "epoch": 1.7806894797099606, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.042097806930542, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8750408887863159, + "num_tokens": 533902877.0, + "step": 13998 + }, + { + "epoch": 1.780816689988551, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8873318433761597, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8689090609550476, + "num_tokens": 533937440.0, + "step": 13999 + }, + { + "epoch": 1.7809439002671414, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9316325187683105, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8729645609855652, + "num_tokens": 533972308.0, + "step": 14000 + }, + { + "epoch": 1.781071110545732, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.001432180404663, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8503419756889343, + "num_tokens": 534011913.0, + "step": 14001 + }, + { + "epoch": 1.7811983208243225, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6719822883605957, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8750566840171814, + "num_tokens": 534057235.0, + "step": 14002 + }, + { + "epoch": 1.781325531102913, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.024101972579956, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8642096519470215, + "num_tokens": 534092850.0, + "step": 14003 + }, + { + "epoch": 1.7814527413815036, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8223451375961304, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8695030808448792, + "num_tokens": 534135310.0, + "step": 14004 + }, + { + "epoch": 1.781579951660094, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8095351457595825, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8634706735610962, + "num_tokens": 534176008.0, + "step": 14005 + }, + { + "epoch": 1.7817071619386846, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9243041276931763, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8639623522758484, + "num_tokens": 534208217.0, + "step": 14006 + }, + { + "epoch": 1.7818343722172751, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.810336947441101, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8676590323448181, + "num_tokens": 534247564.0, + "step": 14007 + }, + { + "epoch": 1.7819615824958657, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9936100244522095, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8459974527359009, + "num_tokens": 534281144.0, + "step": 14008 + }, + { + "epoch": 1.7820887927744562, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8557219505310059, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8543109893798828, + "num_tokens": 534317893.0, + "step": 14009 + }, + { + "epoch": 1.7822160030530467, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8661203384399414, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.861534833908081, + "num_tokens": 534356116.0, + "step": 14010 + }, + { + "epoch": 1.7823432133316373, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.060051202774048, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8370370864868164, + "num_tokens": 534390499.0, + "step": 14011 + }, + { + "epoch": 1.7824704236102278, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.156069755554199, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8591432571411133, + "num_tokens": 534424242.0, + "step": 14012 + }, + { + "epoch": 1.7825976338888183, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8963373899459839, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8552815914154053, + "num_tokens": 534467465.0, + "step": 14013 + }, + { + "epoch": 1.7827248441674088, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9440594911575317, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8578006029129028, + "num_tokens": 534502959.0, + "step": 14014 + }, + { + "epoch": 1.7828520544459994, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8074854612350464, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8644014596939087, + "num_tokens": 534549888.0, + "step": 14015 + }, + { + "epoch": 1.78297926472459, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8956693410873413, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8660544753074646, + "num_tokens": 534591686.0, + "step": 14016 + }, + { + "epoch": 1.7831064750031802, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7992254495620728, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8803661465644836, + "num_tokens": 534628584.0, + "step": 14017 + }, + { + "epoch": 1.7832336852817707, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9275364875793457, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8540186285972595, + "num_tokens": 534665684.0, + "step": 14018 + }, + { + "epoch": 1.7833608955603613, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9123727083206177, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8662005662918091, + "num_tokens": 534701604.0, + "step": 14019 + }, + { + "epoch": 1.7834881058389518, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8312135934829712, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8488430380821228, + "num_tokens": 534741765.0, + "step": 14020 + }, + { + "epoch": 1.7836153161175423, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8463208675384521, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8728992938995361, + "num_tokens": 534780331.0, + "step": 14021 + }, + { + "epoch": 1.7837425263961328, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.075075387954712, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8520755171775818, + "num_tokens": 534816254.0, + "step": 14022 + }, + { + "epoch": 1.7838697366747234, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0116024017333984, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8616845607757568, + "num_tokens": 534851367.0, + "step": 14023 + }, + { + "epoch": 1.7839969469533137, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9453299045562744, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8614507913589478, + "num_tokens": 534889077.0, + "step": 14024 + }, + { + "epoch": 1.7841241572319042, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9097832441329956, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8525733947753906, + "num_tokens": 534924249.0, + "step": 14025 + }, + { + "epoch": 1.7842513675104947, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9503183364868164, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8476760387420654, + "num_tokens": 534966370.0, + "step": 14026 + }, + { + "epoch": 1.7843785777890853, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7896583080291748, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8701035976409912, + "num_tokens": 535005869.0, + "step": 14027 + }, + { + "epoch": 1.7845057880676758, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0598549842834473, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8611807823181152, + "num_tokens": 535042057.0, + "step": 14028 + }, + { + "epoch": 1.7846329983462663, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9993641376495361, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8579598665237427, + "num_tokens": 535074410.0, + "step": 14029 + }, + { + "epoch": 1.7847602086248568, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.857162356376648, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8539212942123413, + "num_tokens": 535120356.0, + "step": 14030 + }, + { + "epoch": 1.7848874189034474, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0164413452148438, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.853832483291626, + "num_tokens": 535156939.0, + "step": 14031 + }, + { + "epoch": 1.785014629182038, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8943047523498535, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8654648065567017, + "num_tokens": 535195553.0, + "step": 14032 + }, + { + "epoch": 1.7851418394606284, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8247439861297607, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8689539432525635, + "num_tokens": 535235692.0, + "step": 14033 + }, + { + "epoch": 1.785269049739219, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9402304887771606, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.861088752746582, + "num_tokens": 535269265.0, + "step": 14034 + }, + { + "epoch": 1.7853962600178095, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7882609367370605, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8789899349212646, + "num_tokens": 535303825.0, + "step": 14035 + }, + { + "epoch": 1.7855234702964, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9071345329284668, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8569010496139526, + "num_tokens": 535338802.0, + "step": 14036 + }, + { + "epoch": 1.7856506805749905, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 16.606769561767578, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8692893981933594, + "num_tokens": 535376592.0, + "step": 14037 + }, + { + "epoch": 1.785777890853581, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.004767417907715, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8586565256118774, + "num_tokens": 535416528.0, + "step": 14038 + }, + { + "epoch": 1.7859051011321716, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9959982633590698, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8588436245918274, + "num_tokens": 535450892.0, + "step": 14039 + }, + { + "epoch": 1.7860323114107621, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.9066945314407349, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8654279708862305, + "num_tokens": 535488198.0, + "step": 14040 + }, + { + "epoch": 1.7861595216893527, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.9649345874786377, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8609687089920044, + "num_tokens": 535526020.0, + "step": 14041 + }, + { + "epoch": 1.786286731967943, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.086764097213745, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.868505597114563, + "num_tokens": 535560400.0, + "step": 14042 + }, + { + "epoch": 1.7864139422465335, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9992445707321167, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.855291485786438, + "num_tokens": 535594699.0, + "step": 14043 + }, + { + "epoch": 1.786541152525124, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9056096076965332, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8672030568122864, + "num_tokens": 535627019.0, + "step": 14044 + }, + { + "epoch": 1.7866683628037145, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7520960569381714, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.872674286365509, + "num_tokens": 535667479.0, + "step": 14045 + }, + { + "epoch": 1.786795573082305, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9216125011444092, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8577593564987183, + "num_tokens": 535705211.0, + "step": 14046 + }, + { + "epoch": 1.7869227833608956, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9018378257751465, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8448294401168823, + "num_tokens": 535744907.0, + "step": 14047 + }, + { + "epoch": 1.787049993639486, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9338626861572266, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8618283271789551, + "num_tokens": 535782526.0, + "step": 14048 + }, + { + "epoch": 1.7871772039180764, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8471596240997314, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8788845539093018, + "num_tokens": 535824153.0, + "step": 14049 + }, + { + "epoch": 1.787304414196667, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9030587673187256, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8743549585342407, + "num_tokens": 535859019.0, + "step": 14050 + }, + { + "epoch": 1.7874316244752575, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0133988857269287, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8745469450950623, + "num_tokens": 535896866.0, + "step": 14051 + }, + { + "epoch": 1.787558834753848, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.984736442565918, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8646287322044373, + "num_tokens": 535937128.0, + "step": 14052 + }, + { + "epoch": 1.7876860450324386, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8984336853027344, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8801088929176331, + "num_tokens": 535972727.0, + "step": 14053 + }, + { + "epoch": 1.787813255311029, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8410496711730957, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8696125745773315, + "num_tokens": 536011491.0, + "step": 14054 + }, + { + "epoch": 1.7879404655896196, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9244813919067383, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8674563765525818, + "num_tokens": 536048074.0, + "step": 14055 + }, + { + "epoch": 1.7880676758682101, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9109621047973633, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8709384202957153, + "num_tokens": 536089554.0, + "step": 14056 + }, + { + "epoch": 1.7881948861468007, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 16.590160369873047, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8743679523468018, + "num_tokens": 536127672.0, + "step": 14057 + }, + { + "epoch": 1.7883220964253912, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.17433762550354, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8640345335006714, + "num_tokens": 536157517.0, + "step": 14058 + }, + { + "epoch": 1.7884493067039817, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1107561588287354, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8767534494400024, + "num_tokens": 536192855.0, + "step": 14059 + }, + { + "epoch": 1.7885765169825723, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.052046537399292, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8650408983230591, + "num_tokens": 536231727.0, + "step": 14060 + }, + { + "epoch": 1.7887037272611628, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.924168348312378, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8774645328521729, + "num_tokens": 536270738.0, + "step": 14061 + }, + { + "epoch": 1.7888309375397533, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7671911716461182, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.88133704662323, + "num_tokens": 536313006.0, + "step": 14062 + }, + { + "epoch": 1.7889581478183438, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8620736598968506, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.858978271484375, + "num_tokens": 536350697.0, + "step": 14063 + }, + { + "epoch": 1.7890853580969344, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7677699327468872, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8721314668655396, + "num_tokens": 536394252.0, + "step": 14064 + }, + { + "epoch": 1.789212568375525, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2264599800109863, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8613499999046326, + "num_tokens": 536426591.0, + "step": 14065 + }, + { + "epoch": 1.7893397786541152, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.074281930923462, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8511379361152649, + "num_tokens": 536468567.0, + "step": 14066 + }, + { + "epoch": 1.7894669889327057, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0653319358825684, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8546744585037231, + "num_tokens": 536506352.0, + "step": 14067 + }, + { + "epoch": 1.7895941992112963, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0120627880096436, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8579804301261902, + "num_tokens": 536538122.0, + "step": 14068 + }, + { + "epoch": 1.7897214094898868, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.861118197441101, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8555731773376465, + "num_tokens": 536574617.0, + "step": 14069 + }, + { + "epoch": 1.7898486197684773, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.78207528591156, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8767669200897217, + "num_tokens": 536611895.0, + "step": 14070 + }, + { + "epoch": 1.7899758300470678, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.263580322265625, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8577991724014282, + "num_tokens": 536651336.0, + "step": 14071 + }, + { + "epoch": 1.7901030403256584, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8519668579101562, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8590037226676941, + "num_tokens": 536692566.0, + "step": 14072 + }, + { + "epoch": 1.7902302506042487, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.834313154220581, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8698949217796326, + "num_tokens": 536730855.0, + "step": 14073 + }, + { + "epoch": 1.7903574608828392, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 80.52465057373047, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8615269064903259, + "num_tokens": 536766478.0, + "step": 14074 + }, + { + "epoch": 1.7904846711614297, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2450156211853027, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.857894778251648, + "num_tokens": 536799900.0, + "step": 14075 + }, + { + "epoch": 1.7906118814400203, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.045471429824829, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8703831434249878, + "num_tokens": 536843597.0, + "step": 14076 + }, + { + "epoch": 1.7907390917186108, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.009087324142456, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8744595646858215, + "num_tokens": 536879189.0, + "step": 14077 + }, + { + "epoch": 1.7908663019972013, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.912379264831543, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8689875602722168, + "num_tokens": 536918099.0, + "step": 14078 + }, + { + "epoch": 1.7909935122757918, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.888832688331604, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8551747798919678, + "num_tokens": 536954410.0, + "step": 14079 + }, + { + "epoch": 1.7911207225543824, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9300297498703003, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8856831789016724, + "num_tokens": 536986688.0, + "step": 14080 + }, + { + "epoch": 1.791247932832973, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.890047550201416, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8656367063522339, + "num_tokens": 537024181.0, + "step": 14081 + }, + { + "epoch": 1.7913751431115634, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8348326683044434, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8677586317062378, + "num_tokens": 537064174.0, + "step": 14082 + }, + { + "epoch": 1.791502353390154, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0124030113220215, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8555266857147217, + "num_tokens": 537100279.0, + "step": 14083 + }, + { + "epoch": 1.7916295636687445, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0535683631896973, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8406093120574951, + "num_tokens": 537135339.0, + "step": 14084 + }, + { + "epoch": 1.791756773947335, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9301955699920654, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8570257425308228, + "num_tokens": 537169608.0, + "step": 14085 + }, + { + "epoch": 1.7918839842259255, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8742471933364868, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8729185461997986, + "num_tokens": 537205560.0, + "step": 14086 + }, + { + "epoch": 1.792011194504516, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0228021144866943, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8527137041091919, + "num_tokens": 537238697.0, + "step": 14087 + }, + { + "epoch": 1.7921384047831066, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.260535955429077, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8597534894943237, + "num_tokens": 537282365.0, + "step": 14088 + }, + { + "epoch": 1.7922656150616971, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.967984676361084, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8716931939125061, + "num_tokens": 537320805.0, + "step": 14089 + }, + { + "epoch": 1.7923928253402877, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.183393716812134, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8675352931022644, + "num_tokens": 537354607.0, + "step": 14090 + }, + { + "epoch": 1.792520035618878, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.084583282470703, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8749630451202393, + "num_tokens": 537386672.0, + "step": 14091 + }, + { + "epoch": 1.7926472458974685, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9296895265579224, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8659048080444336, + "num_tokens": 537430882.0, + "step": 14092 + }, + { + "epoch": 1.792774456176059, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.490931987762451, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.866331934928894, + "num_tokens": 537465114.0, + "step": 14093 + }, + { + "epoch": 1.7929016664546495, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.931559681892395, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8584814071655273, + "num_tokens": 537502820.0, + "step": 14094 + }, + { + "epoch": 1.79302887673324, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9117498397827148, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8721978664398193, + "num_tokens": 537537422.0, + "step": 14095 + }, + { + "epoch": 1.7931560870118306, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.057765483856201, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8687658309936523, + "num_tokens": 537566828.0, + "step": 14096 + }, + { + "epoch": 1.793283297290421, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 16.60552978515625, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8614064455032349, + "num_tokens": 537606102.0, + "step": 14097 + }, + { + "epoch": 1.7934105075690114, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0945401191711426, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8710533380508423, + "num_tokens": 537638527.0, + "step": 14098 + }, + { + "epoch": 1.793537717847602, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8502259254455566, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8620954751968384, + "num_tokens": 537683375.0, + "step": 14099 + }, + { + "epoch": 1.7936649281261925, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.808294653892517, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8586610555648804, + "num_tokens": 537731354.0, + "step": 14100 + }, + { + "epoch": 1.793792138404783, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8593804836273193, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8613851070404053, + "num_tokens": 537770055.0, + "step": 14101 + }, + { + "epoch": 1.7939193486833735, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0617291927337646, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8430234789848328, + "num_tokens": 537807977.0, + "step": 14102 + }, + { + "epoch": 1.794046558961964, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7747995853424072, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8584455251693726, + "num_tokens": 537847809.0, + "step": 14103 + }, + { + "epoch": 1.7941737692405546, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0016543865203857, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8450405597686768, + "num_tokens": 537885506.0, + "step": 14104 + }, + { + "epoch": 1.7943009795191451, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.99616277217865, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8596951961517334, + "num_tokens": 537918936.0, + "step": 14105 + }, + { + "epoch": 1.7944281897977357, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9975038766860962, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8683305978775024, + "num_tokens": 537954205.0, + "step": 14106 + }, + { + "epoch": 1.7945554000763262, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9103062152862549, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8674390316009521, + "num_tokens": 537989190.0, + "step": 14107 + }, + { + "epoch": 1.7946826103549167, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.878771424293518, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8554174900054932, + "num_tokens": 538028852.0, + "step": 14108 + }, + { + "epoch": 1.7948098206335072, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0613858699798584, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8697679042816162, + "num_tokens": 538061245.0, + "step": 14109 + }, + { + "epoch": 1.7949370309120978, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.726452350616455, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8767575621604919, + "num_tokens": 538103238.0, + "step": 14110 + }, + { + "epoch": 1.7950642411906883, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8747087717056274, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8515185713768005, + "num_tokens": 538142692.0, + "step": 14111 + }, + { + "epoch": 1.7951914514692788, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 3.625858783721924, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8631503582000732, + "num_tokens": 538183155.0, + "step": 14112 + }, + { + "epoch": 1.7953186617478694, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9072701930999756, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8620129823684692, + "num_tokens": 538222063.0, + "step": 14113 + }, + { + "epoch": 1.7954458720264599, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0079009532928467, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8658050894737244, + "num_tokens": 538259847.0, + "step": 14114 + }, + { + "epoch": 1.7955730823050502, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.758415937423706, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8703404068946838, + "num_tokens": 538298557.0, + "step": 14115 + }, + { + "epoch": 1.7957002925836407, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8382986783981323, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8612882494926453, + "num_tokens": 538337926.0, + "step": 14116 + }, + { + "epoch": 1.7958275028622313, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0494160652160645, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8684265613555908, + "num_tokens": 538382755.0, + "step": 14117 + }, + { + "epoch": 1.7959547131408218, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0161824226379395, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8429759740829468, + "num_tokens": 538417639.0, + "step": 14118 + }, + { + "epoch": 1.7960819234194123, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.887956976890564, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8581823706626892, + "num_tokens": 538458444.0, + "step": 14119 + }, + { + "epoch": 1.7962091336980028, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7913272380828857, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.865746021270752, + "num_tokens": 538496580.0, + "step": 14120 + }, + { + "epoch": 1.7963363439765934, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7977368831634521, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8739683628082275, + "num_tokens": 538538474.0, + "step": 14121 + }, + { + "epoch": 1.7964635542551837, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7288988828659058, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8661179542541504, + "num_tokens": 538576828.0, + "step": 14122 + }, + { + "epoch": 1.7965907645337742, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0707857608795166, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8705121278762817, + "num_tokens": 538611630.0, + "step": 14123 + }, + { + "epoch": 1.7967179748123647, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7473039627075195, + "learning_rate": 1e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.8403847217559814, + "num_tokens": 538659537.0, + "step": 14124 + }, + { + "epoch": 1.7968451850909553, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7059893608093262, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8726043701171875, + "num_tokens": 538700216.0, + "step": 14125 + }, + { + "epoch": 1.7969723953695458, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8864628076553345, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8612510561943054, + "num_tokens": 538737719.0, + "step": 14126 + }, + { + "epoch": 1.7970996056481363, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8443289995193481, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8714431524276733, + "num_tokens": 538774723.0, + "step": 14127 + }, + { + "epoch": 1.7972268159267268, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8197942972183228, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8720894455909729, + "num_tokens": 538815288.0, + "step": 14128 + }, + { + "epoch": 1.7973540262053174, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9458389282226562, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8641009330749512, + "num_tokens": 538858239.0, + "step": 14129 + }, + { + "epoch": 1.797481236483908, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.7130863666534424, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8663842678070068, + "num_tokens": 538896947.0, + "step": 14130 + }, + { + "epoch": 1.7976084467624984, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8508868217468262, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8611950874328613, + "num_tokens": 538936511.0, + "step": 14131 + }, + { + "epoch": 1.797735657041089, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0812151432037354, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8569590449333191, + "num_tokens": 538969198.0, + "step": 14132 + }, + { + "epoch": 1.7978628673196795, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.852986454963684, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8751187324523926, + "num_tokens": 539002258.0, + "step": 14133 + }, + { + "epoch": 1.79799007759827, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8151981830596924, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.850609302520752, + "num_tokens": 539039993.0, + "step": 14134 + }, + { + "epoch": 1.7981172878768605, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7260894775390625, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8609801530838013, + "num_tokens": 539080713.0, + "step": 14135 + }, + { + "epoch": 1.798244498155451, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7720403671264648, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8757184743881226, + "num_tokens": 539121115.0, + "step": 14136 + }, + { + "epoch": 1.7983717084340416, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6766934394836426, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8831141591072083, + "num_tokens": 539159889.0, + "step": 14137 + }, + { + "epoch": 1.7984989187126321, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7483105659484863, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.863972544670105, + "num_tokens": 539201508.0, + "step": 14138 + }, + { + "epoch": 1.7986261289912227, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.915906310081482, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8464658856391907, + "num_tokens": 539238640.0, + "step": 14139 + }, + { + "epoch": 1.798753339269813, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.849552869796753, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8597694635391235, + "num_tokens": 539274797.0, + "step": 14140 + }, + { + "epoch": 1.7988805495484035, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9491747617721558, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8524690270423889, + "num_tokens": 539308410.0, + "step": 14141 + }, + { + "epoch": 1.799007759826994, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8412474393844604, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8571674823760986, + "num_tokens": 539345578.0, + "step": 14142 + }, + { + "epoch": 1.7991349701055845, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3450706005096436, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8482526540756226, + "num_tokens": 539378946.0, + "step": 14143 + }, + { + "epoch": 1.799262180384175, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7926833629608154, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8770777583122253, + "num_tokens": 539417663.0, + "step": 14144 + }, + { + "epoch": 1.7993893906627656, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.635740280151367, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8786934018135071, + "num_tokens": 539459597.0, + "step": 14145 + }, + { + "epoch": 1.799516600941356, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.863017201423645, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8662928938865662, + "num_tokens": 539499579.0, + "step": 14146 + }, + { + "epoch": 1.7996438112199464, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.901240587234497, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8742240071296692, + "num_tokens": 539538505.0, + "step": 14147 + }, + { + "epoch": 1.799771021498537, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7713792324066162, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8575811386108398, + "num_tokens": 539574824.0, + "step": 14148 + }, + { + "epoch": 1.7998982317771275, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.7140185832977295, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8519322872161865, + "num_tokens": 539614181.0, + "step": 14149 + }, + { + "epoch": 1.800025442055718, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9247889518737793, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8512595295906067, + "num_tokens": 539656713.0, + "step": 14150 + }, + { + "epoch": 1.8001526523343085, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9170727729797363, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.861312747001648, + "num_tokens": 539695436.0, + "step": 14151 + }, + { + "epoch": 1.800279862612899, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.679341197013855, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8663432002067566, + "num_tokens": 539736549.0, + "step": 14152 + }, + { + "epoch": 1.8004070728914896, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.004034996032715, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8682762980461121, + "num_tokens": 539776326.0, + "step": 14153 + }, + { + "epoch": 1.8005342831700801, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.592966079711914, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8711972236633301, + "num_tokens": 539819665.0, + "step": 14154 + }, + { + "epoch": 1.8006614934486707, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8445780277252197, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8635630011558533, + "num_tokens": 539858175.0, + "step": 14155 + }, + { + "epoch": 1.8007887037272612, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8425109386444092, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8632386922836304, + "num_tokens": 539895152.0, + "step": 14156 + }, + { + "epoch": 1.8009159140058517, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7416349649429321, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8683658242225647, + "num_tokens": 539941033.0, + "step": 14157 + }, + { + "epoch": 1.8010431242844422, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9491596221923828, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8663102388381958, + "num_tokens": 539977959.0, + "step": 14158 + }, + { + "epoch": 1.8011703345630328, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9477739334106445, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8623721599578857, + "num_tokens": 540014857.0, + "step": 14159 + }, + { + "epoch": 1.8012975448416233, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8634281158447266, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8620415329933167, + "num_tokens": 540055775.0, + "step": 14160 + }, + { + "epoch": 1.8014247551202138, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.144289016723633, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8718633651733398, + "num_tokens": 540090287.0, + "step": 14161 + }, + { + "epoch": 1.8015519653988044, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9046088457107544, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8716557025909424, + "num_tokens": 540129139.0, + "step": 14162 + }, + { + "epoch": 1.8016791756773949, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9178102016448975, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8633538484573364, + "num_tokens": 540165187.0, + "step": 14163 + }, + { + "epoch": 1.8018063859559852, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7820285558700562, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.876203179359436, + "num_tokens": 540201113.0, + "step": 14164 + }, + { + "epoch": 1.8019335962345757, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8871238231658936, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8646878004074097, + "num_tokens": 540237448.0, + "step": 14165 + }, + { + "epoch": 1.8020608065131662, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7862591743469238, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.857029914855957, + "num_tokens": 540279703.0, + "step": 14166 + }, + { + "epoch": 1.8021880167917568, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.028425455093384, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8441811203956604, + "num_tokens": 540313071.0, + "step": 14167 + }, + { + "epoch": 1.8023152270703473, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.833138346672058, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8684177398681641, + "num_tokens": 540353940.0, + "step": 14168 + }, + { + "epoch": 1.8024424373489378, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.08123517036438, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8600078821182251, + "num_tokens": 540391241.0, + "step": 14169 + }, + { + "epoch": 1.8025696476275284, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0559017658233643, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8770058751106262, + "num_tokens": 540420205.0, + "step": 14170 + }, + { + "epoch": 1.8026968579061187, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0373144149780273, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8685498237609863, + "num_tokens": 540452121.0, + "step": 14171 + }, + { + "epoch": 1.8028240681847092, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8995221853256226, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8641907572746277, + "num_tokens": 540487415.0, + "step": 14172 + }, + { + "epoch": 1.8029512784632997, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0983009338378906, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8526678085327148, + "num_tokens": 540522293.0, + "step": 14173 + }, + { + "epoch": 1.8030784887418903, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.877789855003357, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8753657341003418, + "num_tokens": 540560345.0, + "step": 14174 + }, + { + "epoch": 1.8032056990204808, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.802345871925354, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8724541068077087, + "num_tokens": 540598299.0, + "step": 14175 + }, + { + "epoch": 1.8033329092990713, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7932416200637817, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8749033212661743, + "num_tokens": 540634480.0, + "step": 14176 + }, + { + "epoch": 1.8034601195776618, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7035436630249023, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8551288843154907, + "num_tokens": 540679473.0, + "step": 14177 + }, + { + "epoch": 1.8035873298562524, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8387506008148193, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8581777215003967, + "num_tokens": 540722259.0, + "step": 14178 + }, + { + "epoch": 1.803714540134843, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.056839942932129, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8686219453811646, + "num_tokens": 540758058.0, + "step": 14179 + }, + { + "epoch": 1.8038417504134334, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8532159328460693, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.859178900718689, + "num_tokens": 540798535.0, + "step": 14180 + }, + { + "epoch": 1.803968960692024, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7778093814849854, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8640797138214111, + "num_tokens": 540839423.0, + "step": 14181 + }, + { + "epoch": 1.8040961709706145, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7660166025161743, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8699522614479065, + "num_tokens": 540887512.0, + "step": 14182 + }, + { + "epoch": 1.804223381249205, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0408170223236084, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8670771718025208, + "num_tokens": 540922292.0, + "step": 14183 + }, + { + "epoch": 1.8043505915277955, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.765323281288147, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8710954189300537, + "num_tokens": 540964105.0, + "step": 14184 + }, + { + "epoch": 1.804477801806386, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 3.9500715732574463, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8630302548408508, + "num_tokens": 541001793.0, + "step": 14185 + }, + { + "epoch": 1.8046050120849766, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.887032151222229, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.870319128036499, + "num_tokens": 541041104.0, + "step": 14186 + }, + { + "epoch": 1.8047322223635671, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 6.357364654541016, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8584388494491577, + "num_tokens": 541079643.0, + "step": 14187 + }, + { + "epoch": 1.8048594326421576, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.892722487449646, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8698067665100098, + "num_tokens": 541118554.0, + "step": 14188 + }, + { + "epoch": 1.804986642920748, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9484987258911133, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8519909381866455, + "num_tokens": 541156893.0, + "step": 14189 + }, + { + "epoch": 1.8051138531993385, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7126421928405762, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8804288506507874, + "num_tokens": 541197283.0, + "step": 14190 + }, + { + "epoch": 1.805241063477929, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7955642938613892, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8789806365966797, + "num_tokens": 541232343.0, + "step": 14191 + }, + { + "epoch": 1.8053682737565195, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.983167290687561, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8564829230308533, + "num_tokens": 541270274.0, + "step": 14192 + }, + { + "epoch": 1.80549548403511, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9171077013015747, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8596208691596985, + "num_tokens": 541306018.0, + "step": 14193 + }, + { + "epoch": 1.8056226943137006, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.906537413597107, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8651614189147949, + "num_tokens": 541344974.0, + "step": 14194 + }, + { + "epoch": 1.805749904592291, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.037381172180176, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.873589038848877, + "num_tokens": 541378590.0, + "step": 14195 + }, + { + "epoch": 1.8058771148708814, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8914729356765747, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8665376901626587, + "num_tokens": 541415945.0, + "step": 14196 + }, + { + "epoch": 1.806004325149472, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7765649557113647, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8614528179168701, + "num_tokens": 541459577.0, + "step": 14197 + }, + { + "epoch": 1.8061315354280625, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.834835410118103, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8641481399536133, + "num_tokens": 541501041.0, + "step": 14198 + }, + { + "epoch": 1.806258745706653, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8854178190231323, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8580071926116943, + "num_tokens": 541538587.0, + "step": 14199 + }, + { + "epoch": 1.8063859559852435, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.05373215675354, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8505557775497437, + "num_tokens": 541582225.0, + "step": 14200 + }, + { + "epoch": 1.806513166263834, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.915149450302124, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8742856383323669, + "num_tokens": 541616956.0, + "step": 14201 + }, + { + "epoch": 1.8066403765424246, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0583746433258057, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8538557887077332, + "num_tokens": 541650814.0, + "step": 14202 + }, + { + "epoch": 1.8067675868210151, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7551401853561401, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8731570243835449, + "num_tokens": 541692167.0, + "step": 14203 + }, + { + "epoch": 1.8068947970996057, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8216867446899414, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8541154265403748, + "num_tokens": 541732427.0, + "step": 14204 + }, + { + "epoch": 1.8070220073781962, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9561892747879028, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8643466234207153, + "num_tokens": 541773256.0, + "step": 14205 + }, + { + "epoch": 1.8071492176567867, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9344121217727661, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.870150625705719, + "num_tokens": 541815008.0, + "step": 14206 + }, + { + "epoch": 1.8072764279353772, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.990257740020752, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.867074728012085, + "num_tokens": 541853462.0, + "step": 14207 + }, + { + "epoch": 1.8074036382139678, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0063295364379883, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8604878187179565, + "num_tokens": 541890487.0, + "step": 14208 + }, + { + "epoch": 1.8075308484925583, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.771161675453186, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.868105411529541, + "num_tokens": 541929857.0, + "step": 14209 + }, + { + "epoch": 1.8076580587711488, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8880220651626587, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8565157651901245, + "num_tokens": 541969923.0, + "step": 14210 + }, + { + "epoch": 1.8077852690497394, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8518503904342651, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8735194206237793, + "num_tokens": 542008802.0, + "step": 14211 + }, + { + "epoch": 1.8079124793283299, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.717208743095398, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8781707286834717, + "num_tokens": 542049276.0, + "step": 14212 + }, + { + "epoch": 1.8080396896069202, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8102530241012573, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8576345443725586, + "num_tokens": 542086921.0, + "step": 14213 + }, + { + "epoch": 1.8081668998855107, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8944079875946045, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8637949824333191, + "num_tokens": 542124291.0, + "step": 14214 + }, + { + "epoch": 1.8082941101641012, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.841227412223816, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8524223566055298, + "num_tokens": 542167600.0, + "step": 14215 + }, + { + "epoch": 1.8084213204426918, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0361173152923584, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8588862419128418, + "num_tokens": 542201134.0, + "step": 14216 + }, + { + "epoch": 1.8085485307212823, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.854610562324524, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8512005805969238, + "num_tokens": 542242590.0, + "step": 14217 + }, + { + "epoch": 1.8086757409998728, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8573352098464966, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8637152910232544, + "num_tokens": 542281945.0, + "step": 14218 + }, + { + "epoch": 1.8088029512784631, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.977018117904663, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8728405237197876, + "num_tokens": 542319389.0, + "step": 14219 + }, + { + "epoch": 1.8089301615570537, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0784168243408203, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8520466089248657, + "num_tokens": 542356912.0, + "step": 14220 + }, + { + "epoch": 1.8090573718356442, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.847579836845398, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.857701301574707, + "num_tokens": 542394054.0, + "step": 14221 + }, + { + "epoch": 1.8091845821142347, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8726677894592285, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8692033290863037, + "num_tokens": 542431063.0, + "step": 14222 + }, + { + "epoch": 1.8093117923928252, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8334484100341797, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8519592881202698, + "num_tokens": 542471398.0, + "step": 14223 + }, + { + "epoch": 1.8094390026714158, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7695837020874023, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8497293591499329, + "num_tokens": 542516288.0, + "step": 14224 + }, + { + "epoch": 1.8095662129500063, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1820220947265625, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8627884387969971, + "num_tokens": 542552292.0, + "step": 14225 + }, + { + "epoch": 1.8096934232285968, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.725084900856018, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8657097220420837, + "num_tokens": 542592380.0, + "step": 14226 + }, + { + "epoch": 1.8098206335071874, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8652812242507935, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8557238578796387, + "num_tokens": 542632698.0, + "step": 14227 + }, + { + "epoch": 1.8099478437857779, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9045360088348389, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.853967547416687, + "num_tokens": 542668234.0, + "step": 14228 + }, + { + "epoch": 1.8100750540643684, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.066256523132324, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8624767661094666, + "num_tokens": 542706205.0, + "step": 14229 + }, + { + "epoch": 1.810202264342959, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.3611128330230713, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8619909286499023, + "num_tokens": 542737068.0, + "step": 14230 + }, + { + "epoch": 1.8103294746215495, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.005359172821045, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8762511014938354, + "num_tokens": 542769776.0, + "step": 14231 + }, + { + "epoch": 1.81045668490014, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9183458089828491, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8771803975105286, + "num_tokens": 542802440.0, + "step": 14232 + }, + { + "epoch": 1.8105838951787305, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8654223680496216, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8754358887672424, + "num_tokens": 542841158.0, + "step": 14233 + }, + { + "epoch": 1.810711105457321, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9538692235946655, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.859575092792511, + "num_tokens": 542876637.0, + "step": 14234 + }, + { + "epoch": 1.8108383157359116, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8854732513427734, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8707812428474426, + "num_tokens": 542915087.0, + "step": 14235 + }, + { + "epoch": 1.8109655260145021, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0611584186553955, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8565680384635925, + "num_tokens": 542950635.0, + "step": 14236 + }, + { + "epoch": 1.8110927362930926, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7791887521743774, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8604140281677246, + "num_tokens": 542990078.0, + "step": 14237 + }, + { + "epoch": 1.811219946571683, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9253864288330078, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8380763530731201, + "num_tokens": 543030333.0, + "step": 14238 + }, + { + "epoch": 1.8113471568502735, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9533361196517944, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8572129607200623, + "num_tokens": 543070537.0, + "step": 14239 + }, + { + "epoch": 1.811474367128864, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9515247344970703, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8612180948257446, + "num_tokens": 543109137.0, + "step": 14240 + }, + { + "epoch": 1.8116015774074545, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9126956462860107, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8559103608131409, + "num_tokens": 543146616.0, + "step": 14241 + }, + { + "epoch": 1.811728787686045, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0206685066223145, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.878128707408905, + "num_tokens": 543178433.0, + "step": 14242 + }, + { + "epoch": 1.8118559979646356, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.816442847251892, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8760865330696106, + "num_tokens": 543215086.0, + "step": 14243 + }, + { + "epoch": 1.811983208243226, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1681370735168457, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8469957113265991, + "num_tokens": 543252111.0, + "step": 14244 + }, + { + "epoch": 1.8121104185218164, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.745618462562561, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.870965301990509, + "num_tokens": 543291938.0, + "step": 14245 + }, + { + "epoch": 1.812237628800407, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.862670660018921, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8674992322921753, + "num_tokens": 543335141.0, + "step": 14246 + }, + { + "epoch": 1.8123648390789975, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8838844299316406, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8652867078781128, + "num_tokens": 543371305.0, + "step": 14247 + }, + { + "epoch": 1.812492049357588, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.902113676071167, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8750964403152466, + "num_tokens": 543407431.0, + "step": 14248 + }, + { + "epoch": 1.8126192596361785, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.71246337890625, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.862966775894165, + "num_tokens": 543450287.0, + "step": 14249 + }, + { + "epoch": 1.812746469914769, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8933184146881104, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.877183198928833, + "num_tokens": 543486521.0, + "step": 14250 + }, + { + "epoch": 1.8128736801933596, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8350428342819214, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8638087511062622, + "num_tokens": 543522157.0, + "step": 14251 + }, + { + "epoch": 1.8130008904719501, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.176499605178833, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8667527437210083, + "num_tokens": 543560503.0, + "step": 14252 + }, + { + "epoch": 1.8131281007505406, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9584476947784424, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8515021204948425, + "num_tokens": 543598943.0, + "step": 14253 + }, + { + "epoch": 1.8132553110291312, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7235054969787598, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8655028939247131, + "num_tokens": 543640466.0, + "step": 14254 + }, + { + "epoch": 1.8133825213077217, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.837829351425171, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8660281896591187, + "num_tokens": 543677581.0, + "step": 14255 + }, + { + "epoch": 1.8135097315863122, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9474745988845825, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8571412563323975, + "num_tokens": 543711220.0, + "step": 14256 + }, + { + "epoch": 1.8136369418649028, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.936719536781311, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.859749436378479, + "num_tokens": 543750109.0, + "step": 14257 + }, + { + "epoch": 1.8137641521434933, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9570977687835693, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8690375685691833, + "num_tokens": 543788040.0, + "step": 14258 + }, + { + "epoch": 1.8138913624220838, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1200456619262695, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8643348217010498, + "num_tokens": 543821037.0, + "step": 14259 + }, + { + "epoch": 1.8140185727006743, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8935554027557373, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.876109778881073, + "num_tokens": 543856331.0, + "step": 14260 + }, + { + "epoch": 1.8141457829792649, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8511089086532593, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8691061735153198, + "num_tokens": 543892898.0, + "step": 14261 + }, + { + "epoch": 1.8142729932578552, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9106258153915405, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8592120409011841, + "num_tokens": 543932858.0, + "step": 14262 + }, + { + "epoch": 1.8144002035364457, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0017857551574707, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8713570833206177, + "num_tokens": 543968145.0, + "step": 14263 + }, + { + "epoch": 1.8145274138150362, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.074103832244873, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8630667328834534, + "num_tokens": 544000495.0, + "step": 14264 + }, + { + "epoch": 1.8146546240936268, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8190617561340332, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8580766320228577, + "num_tokens": 544041345.0, + "step": 14265 + }, + { + "epoch": 1.8147818343722173, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8828463554382324, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8692297339439392, + "num_tokens": 544080084.0, + "step": 14266 + }, + { + "epoch": 1.8149090446508078, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9632467031478882, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8562088012695312, + "num_tokens": 544115828.0, + "step": 14267 + }, + { + "epoch": 1.8150362549293981, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.864034652709961, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8605344295501709, + "num_tokens": 544153213.0, + "step": 14268 + }, + { + "epoch": 1.8151634652079887, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9776045083999634, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8672797679901123, + "num_tokens": 544192383.0, + "step": 14269 + }, + { + "epoch": 1.8152906754865792, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.94504714012146, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8661826848983765, + "num_tokens": 544232034.0, + "step": 14270 + }, + { + "epoch": 1.8154178857651697, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0990796089172363, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8469706773757935, + "num_tokens": 544266242.0, + "step": 14271 + }, + { + "epoch": 1.8155450960437602, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0762405395507812, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8538813591003418, + "num_tokens": 544304995.0, + "step": 14272 + }, + { + "epoch": 1.8156723063223508, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0152204036712646, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8628364205360413, + "num_tokens": 544341950.0, + "step": 14273 + }, + { + "epoch": 1.8157995166009413, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0382180213928223, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.859106183052063, + "num_tokens": 544379535.0, + "step": 14274 + }, + { + "epoch": 1.8159267268795318, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9373968839645386, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8624656200408936, + "num_tokens": 544416939.0, + "step": 14275 + }, + { + "epoch": 1.8160539371581224, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8508877754211426, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8760315179824829, + "num_tokens": 544461476.0, + "step": 14276 + }, + { + "epoch": 1.8161811474367129, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8423118591308594, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8641440868377686, + "num_tokens": 544505051.0, + "step": 14277 + }, + { + "epoch": 1.8163083577153034, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7992888689041138, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8634529113769531, + "num_tokens": 544550408.0, + "step": 14278 + }, + { + "epoch": 1.816435567993894, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9462896585464478, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8622977137565613, + "num_tokens": 544586645.0, + "step": 14279 + }, + { + "epoch": 1.8165627782724845, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8677154779434204, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8564128875732422, + "num_tokens": 544628087.0, + "step": 14280 + }, + { + "epoch": 1.816689988551075, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.7555091381073, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8787282705307007, + "num_tokens": 544667961.0, + "step": 14281 + }, + { + "epoch": 1.8168171988296655, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9444563388824463, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8613893389701843, + "num_tokens": 544704919.0, + "step": 14282 + }, + { + "epoch": 1.816944409108256, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9345712661743164, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8686457276344299, + "num_tokens": 544741421.0, + "step": 14283 + }, + { + "epoch": 1.8170716193868466, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1492085456848145, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8674117922782898, + "num_tokens": 544776414.0, + "step": 14284 + }, + { + "epoch": 1.817198829665437, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8413249254226685, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8571821451187134, + "num_tokens": 544814964.0, + "step": 14285 + }, + { + "epoch": 1.8173260399440276, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8065519332885742, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8508566617965698, + "num_tokens": 544853381.0, + "step": 14286 + }, + { + "epoch": 1.817453250222618, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8156249523162842, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8669518232345581, + "num_tokens": 544892474.0, + "step": 14287 + }, + { + "epoch": 1.8175804605012085, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.815479040145874, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8706077933311462, + "num_tokens": 544927667.0, + "step": 14288 + }, + { + "epoch": 1.817707670779799, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.940574288368225, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8701860308647156, + "num_tokens": 544959780.0, + "step": 14289 + }, + { + "epoch": 1.8178348810583895, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.000098466873169, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8561801910400391, + "num_tokens": 545001832.0, + "step": 14290 + }, + { + "epoch": 1.81796209133698, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9774246215820312, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8720934391021729, + "num_tokens": 545038489.0, + "step": 14291 + }, + { + "epoch": 1.8180893016155706, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.223822593688965, + "learning_rate": 1e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8442842960357666, + "num_tokens": 545069086.0, + "step": 14292 + }, + { + "epoch": 1.818216511894161, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.017003297805786, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8668786287307739, + "num_tokens": 545104564.0, + "step": 14293 + }, + { + "epoch": 1.8183437221727514, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8031638860702515, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8693730235099792, + "num_tokens": 545149036.0, + "step": 14294 + }, + { + "epoch": 1.818470932451342, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7867045402526855, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.875564694404602, + "num_tokens": 545183825.0, + "step": 14295 + }, + { + "epoch": 1.8185981427299325, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8215440511703491, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8670228719711304, + "num_tokens": 545221274.0, + "step": 14296 + }, + { + "epoch": 1.818725353008523, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9392781257629395, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8711336255073547, + "num_tokens": 545259242.0, + "step": 14297 + }, + { + "epoch": 1.8188525632871135, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0574562549591064, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.849018931388855, + "num_tokens": 545289380.0, + "step": 14298 + }, + { + "epoch": 1.818979773565704, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.992812156677246, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8667840957641602, + "num_tokens": 545321935.0, + "step": 14299 + }, + { + "epoch": 1.8191069838442946, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6511147022247314, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8711596727371216, + "num_tokens": 545365714.0, + "step": 14300 + }, + { + "epoch": 1.8192341941228851, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.818755865097046, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8538004159927368, + "num_tokens": 545406838.0, + "step": 14301 + }, + { + "epoch": 1.8193614044014756, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8749240636825562, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8604026436805725, + "num_tokens": 545443265.0, + "step": 14302 + }, + { + "epoch": 1.8194886146800662, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8663753271102905, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8738075494766235, + "num_tokens": 545479903.0, + "step": 14303 + }, + { + "epoch": 1.8196158249586567, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.5749456882476807, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8712559342384338, + "num_tokens": 545513767.0, + "step": 14304 + }, + { + "epoch": 1.8197430352372472, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0386905670166016, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.849402904510498, + "num_tokens": 545550541.0, + "step": 14305 + }, + { + "epoch": 1.8198702455158378, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9190068244934082, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8645944595336914, + "num_tokens": 545590739.0, + "step": 14306 + }, + { + "epoch": 1.8199974557944283, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.3566195964813232, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8502668142318726, + "num_tokens": 545627127.0, + "step": 14307 + }, + { + "epoch": 1.8201246660730188, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8566590547561646, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8683288097381592, + "num_tokens": 545665473.0, + "step": 14308 + }, + { + "epoch": 1.8202518763516093, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.068711996078491, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8792611360549927, + "num_tokens": 545708018.0, + "step": 14309 + }, + { + "epoch": 1.8203790866301999, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8644180297851562, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8689402937889099, + "num_tokens": 545745635.0, + "step": 14310 + }, + { + "epoch": 1.8205062969087902, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8459296226501465, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8531490564346313, + "num_tokens": 545785260.0, + "step": 14311 + }, + { + "epoch": 1.8206335071873807, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8851232528686523, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8609583377838135, + "num_tokens": 545821696.0, + "step": 14312 + }, + { + "epoch": 1.8207607174659712, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.908849835395813, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8774470090866089, + "num_tokens": 545857971.0, + "step": 14313 + }, + { + "epoch": 1.8208879277445618, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1704623699188232, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8529040217399597, + "num_tokens": 545891000.0, + "step": 14314 + }, + { + "epoch": 1.8210151380231523, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8129483461380005, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8491261005401611, + "num_tokens": 545932825.0, + "step": 14315 + }, + { + "epoch": 1.8211423483017428, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8585588932037354, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8536801338195801, + "num_tokens": 545972849.0, + "step": 14316 + }, + { + "epoch": 1.8212695585803331, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7460805177688599, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8765654563903809, + "num_tokens": 546011492.0, + "step": 14317 + }, + { + "epoch": 1.8213967688589237, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0286567211151123, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.858920693397522, + "num_tokens": 546048461.0, + "step": 14318 + }, + { + "epoch": 1.8215239791375142, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8774077892303467, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8736386299133301, + "num_tokens": 546087113.0, + "step": 14319 + }, + { + "epoch": 1.8216511894161047, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0004794597625732, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.867321789264679, + "num_tokens": 546124413.0, + "step": 14320 + }, + { + "epoch": 1.8217783996946952, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.668206810951233, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8754755258560181, + "num_tokens": 546165763.0, + "step": 14321 + }, + { + "epoch": 1.8219056099732858, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9503246545791626, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8649102449417114, + "num_tokens": 546203218.0, + "step": 14322 + }, + { + "epoch": 1.8220328202518763, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7940293550491333, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8541145324707031, + "num_tokens": 546247610.0, + "step": 14323 + }, + { + "epoch": 1.8221600305304668, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8284058570861816, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8563811779022217, + "num_tokens": 546288952.0, + "step": 14324 + }, + { + "epoch": 1.8222872408090574, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.804032802581787, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8749412894248962, + "num_tokens": 546322449.0, + "step": 14325 + }, + { + "epoch": 1.8224144510876479, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.83245849609375, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8697054386138916, + "num_tokens": 546362692.0, + "step": 14326 + }, + { + "epoch": 1.8225416613662384, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9712754487991333, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8488580584526062, + "num_tokens": 546398856.0, + "step": 14327 + }, + { + "epoch": 1.822668871644829, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9185125827789307, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8572394251823425, + "num_tokens": 546439633.0, + "step": 14328 + }, + { + "epoch": 1.8227960819234195, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.81602942943573, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8688555359840393, + "num_tokens": 546477282.0, + "step": 14329 + }, + { + "epoch": 1.82292329220201, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8415828943252563, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8752833604812622, + "num_tokens": 546514584.0, + "step": 14330 + }, + { + "epoch": 1.8230505024806005, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2432708740234375, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8653972148895264, + "num_tokens": 546550761.0, + "step": 14331 + }, + { + "epoch": 1.823177712759191, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9581348896026611, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8498544096946716, + "num_tokens": 546594157.0, + "step": 14332 + }, + { + "epoch": 1.8233049230377816, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7456691265106201, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8738619685173035, + "num_tokens": 546634671.0, + "step": 14333 + }, + { + "epoch": 1.823432133316372, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9647419452667236, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8640263080596924, + "num_tokens": 546672084.0, + "step": 14334 + }, + { + "epoch": 1.8235593435949626, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.189316511154175, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8572900891304016, + "num_tokens": 546701920.0, + "step": 14335 + }, + { + "epoch": 1.823686553873553, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9881385564804077, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8671295642852783, + "num_tokens": 546735776.0, + "step": 14336 + }, + { + "epoch": 1.8238137641521435, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.896107792854309, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8688362836837769, + "num_tokens": 546772991.0, + "step": 14337 + }, + { + "epoch": 1.823940974430734, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.031705379486084, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8581008315086365, + "num_tokens": 546805310.0, + "step": 14338 + }, + { + "epoch": 1.8240681847093245, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9613432884216309, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8694201707839966, + "num_tokens": 546838240.0, + "step": 14339 + }, + { + "epoch": 1.824195394987915, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8306388854980469, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8594129085540771, + "num_tokens": 546878484.0, + "step": 14340 + }, + { + "epoch": 1.8243226052665056, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9232292175292969, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8792999982833862, + "num_tokens": 546919747.0, + "step": 14341 + }, + { + "epoch": 1.8244498155450959, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.642240285873413, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8639812469482422, + "num_tokens": 546962614.0, + "step": 14342 + }, + { + "epoch": 1.8245770258236864, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.951781988143921, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8585506677627563, + "num_tokens": 547000059.0, + "step": 14343 + }, + { + "epoch": 1.824704236102277, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8855535984039307, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8673548102378845, + "num_tokens": 547037372.0, + "step": 14344 + }, + { + "epoch": 1.8248314463808675, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9133360385894775, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8503998517990112, + "num_tokens": 547071840.0, + "step": 14345 + }, + { + "epoch": 1.824958656659458, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9807740449905396, + "learning_rate": 1e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8389744758605957, + "num_tokens": 547108916.0, + "step": 14346 + }, + { + "epoch": 1.8250858669380485, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9116051197052002, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.883081316947937, + "num_tokens": 547146991.0, + "step": 14347 + }, + { + "epoch": 1.825213077216639, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7802187204360962, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8750400543212891, + "num_tokens": 547184093.0, + "step": 14348 + }, + { + "epoch": 1.8253402874952296, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9418823719024658, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.870742917060852, + "num_tokens": 547217570.0, + "step": 14349 + }, + { + "epoch": 1.8254674977738201, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7963274717330933, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8620098233222961, + "num_tokens": 547259498.0, + "step": 14350 + }, + { + "epoch": 1.8255947080524106, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9424219131469727, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8640173077583313, + "num_tokens": 547298419.0, + "step": 14351 + }, + { + "epoch": 1.8257219183310012, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6734497547149658, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8721990585327148, + "num_tokens": 547342984.0, + "step": 14352 + }, + { + "epoch": 1.8258491286095917, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8794957399368286, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8645415306091309, + "num_tokens": 547383162.0, + "step": 14353 + }, + { + "epoch": 1.8259763388881822, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8457814455032349, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8858818411827087, + "num_tokens": 547416967.0, + "step": 14354 + }, + { + "epoch": 1.8261035491667728, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9710874557495117, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8670898079872131, + "num_tokens": 547454695.0, + "step": 14355 + }, + { + "epoch": 1.8262307594453633, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.104876756668091, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8742322325706482, + "num_tokens": 547494258.0, + "step": 14356 + }, + { + "epoch": 1.8263579697239538, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.883889079093933, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8596683740615845, + "num_tokens": 547531404.0, + "step": 14357 + }, + { + "epoch": 1.8264851800025443, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9897234439849854, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8699753284454346, + "num_tokens": 547566553.0, + "step": 14358 + }, + { + "epoch": 1.8266123902811349, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8235044479370117, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8568596839904785, + "num_tokens": 547605505.0, + "step": 14359 + }, + { + "epoch": 1.8267396005597252, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7841001749038696, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.872581958770752, + "num_tokens": 547643146.0, + "step": 14360 + }, + { + "epoch": 1.8268668108383157, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.829625129699707, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8618418574333191, + "num_tokens": 547683373.0, + "step": 14361 + }, + { + "epoch": 1.8269940211169062, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.160245180130005, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8586410284042358, + "num_tokens": 547716762.0, + "step": 14362 + }, + { + "epoch": 1.8271212313954968, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8394855260849, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8718038201332092, + "num_tokens": 547756907.0, + "step": 14363 + }, + { + "epoch": 1.8272484416740873, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7962327003479004, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8597546219825745, + "num_tokens": 547793545.0, + "step": 14364 + }, + { + "epoch": 1.8273756519526778, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9315710067749023, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8555099964141846, + "num_tokens": 547836845.0, + "step": 14365 + }, + { + "epoch": 1.8275028622312681, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8168689012527466, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.871657133102417, + "num_tokens": 547875849.0, + "step": 14366 + }, + { + "epoch": 1.8276300725098586, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.3069136142730713, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8584287166595459, + "num_tokens": 547918672.0, + "step": 14367 + }, + { + "epoch": 1.8277572827884492, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.111236095428467, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.851298451423645, + "num_tokens": 547958681.0, + "step": 14368 + }, + { + "epoch": 1.8278844930670397, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8975276947021484, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.859039306640625, + "num_tokens": 547996940.0, + "step": 14369 + }, + { + "epoch": 1.8280117033456302, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9821465015411377, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8735731840133667, + "num_tokens": 548030188.0, + "step": 14370 + }, + { + "epoch": 1.8281389136242208, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0500190258026123, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8799667358398438, + "num_tokens": 548066891.0, + "step": 14371 + }, + { + "epoch": 1.8282661239028113, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9188323020935059, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8702508211135864, + "num_tokens": 548104664.0, + "step": 14372 + }, + { + "epoch": 1.8283933341814018, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8207430839538574, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8595679998397827, + "num_tokens": 548143938.0, + "step": 14373 + }, + { + "epoch": 1.8285205444599923, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.056605339050293, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8631093502044678, + "num_tokens": 548183213.0, + "step": 14374 + }, + { + "epoch": 1.8286477547385829, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 7.720081329345703, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8483977317810059, + "num_tokens": 548221304.0, + "step": 14375 + }, + { + "epoch": 1.8287749650171734, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.202028512954712, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8611993789672852, + "num_tokens": 548261123.0, + "step": 14376 + }, + { + "epoch": 1.828902175295764, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0199944972991943, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8435847759246826, + "num_tokens": 548305040.0, + "step": 14377 + }, + { + "epoch": 1.8290293855743545, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9723361730575562, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8721502423286438, + "num_tokens": 548338538.0, + "step": 14378 + }, + { + "epoch": 1.829156595852945, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8701893091201782, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8681728839874268, + "num_tokens": 548378510.0, + "step": 14379 + }, + { + "epoch": 1.8292838061315355, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.814656138420105, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8582825064659119, + "num_tokens": 548415790.0, + "step": 14380 + }, + { + "epoch": 1.829411016410126, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9460378885269165, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8558264374732971, + "num_tokens": 548447294.0, + "step": 14381 + }, + { + "epoch": 1.8295382266887166, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.037738084793091, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8708893656730652, + "num_tokens": 548484617.0, + "step": 14382 + }, + { + "epoch": 1.829665436967307, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7787108421325684, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8598732948303223, + "num_tokens": 548527552.0, + "step": 14383 + }, + { + "epoch": 1.8297926472458976, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8609528541564941, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8820856809616089, + "num_tokens": 548561322.0, + "step": 14384 + }, + { + "epoch": 1.829919857524488, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9022969007492065, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8750461339950562, + "num_tokens": 548594810.0, + "step": 14385 + }, + { + "epoch": 1.8300470678030785, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2003421783447266, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.856621503829956, + "num_tokens": 548631094.0, + "step": 14386 + }, + { + "epoch": 1.830174278081669, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8138622045516968, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8634253740310669, + "num_tokens": 548669162.0, + "step": 14387 + }, + { + "epoch": 1.8303014883602595, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9782633781433105, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8661755323410034, + "num_tokens": 548704555.0, + "step": 14388 + }, + { + "epoch": 1.83042869863885, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8777921199798584, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8776403069496155, + "num_tokens": 548741852.0, + "step": 14389 + }, + { + "epoch": 1.8305559089174406, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7748805284500122, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8521251678466797, + "num_tokens": 548788455.0, + "step": 14390 + }, + { + "epoch": 1.8306831191960309, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.875028133392334, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8724749088287354, + "num_tokens": 548826347.0, + "step": 14391 + }, + { + "epoch": 1.8308103294746214, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1915481090545654, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8576458096504211, + "num_tokens": 548865428.0, + "step": 14392 + }, + { + "epoch": 1.830937539753212, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.4479429721832275, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8506352305412292, + "num_tokens": 548901671.0, + "step": 14393 + }, + { + "epoch": 1.8310647500318025, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9259662628173828, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8616132736206055, + "num_tokens": 548939234.0, + "step": 14394 + }, + { + "epoch": 1.831191960310393, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8233320713043213, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8684424757957458, + "num_tokens": 548975334.0, + "step": 14395 + }, + { + "epoch": 1.8313191705889835, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9368305206298828, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8592931032180786, + "num_tokens": 549016372.0, + "step": 14396 + }, + { + "epoch": 1.831446380867574, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6862410306930542, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8711576461791992, + "num_tokens": 549058049.0, + "step": 14397 + }, + { + "epoch": 1.8315735911461646, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.825403094291687, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.853050947189331, + "num_tokens": 549097855.0, + "step": 14398 + }, + { + "epoch": 1.831700801424755, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7827805280685425, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8837014436721802, + "num_tokens": 549130906.0, + "step": 14399 + }, + { + "epoch": 1.8318280117033456, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.82943594455719, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8452863693237305, + "num_tokens": 549176574.0, + "step": 14400 + }, + { + "epoch": 1.8319552219819362, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0435400009155273, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8619992733001709, + "num_tokens": 549213945.0, + "step": 14401 + }, + { + "epoch": 1.8320824322605267, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7758610248565674, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8561122417449951, + "num_tokens": 549255843.0, + "step": 14402 + }, + { + "epoch": 1.8322096425391172, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9688833951950073, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8667284846305847, + "num_tokens": 549292573.0, + "step": 14403 + }, + { + "epoch": 1.8323368528177078, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7455189228057861, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.877182126045227, + "num_tokens": 549335291.0, + "step": 14404 + }, + { + "epoch": 1.8324640630962983, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7304484844207764, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8523240685462952, + "num_tokens": 549381337.0, + "step": 14405 + }, + { + "epoch": 1.8325912733748888, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8329477310180664, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8758828043937683, + "num_tokens": 549418180.0, + "step": 14406 + }, + { + "epoch": 1.8327184836534793, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7198078632354736, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8724679350852966, + "num_tokens": 549461623.0, + "step": 14407 + }, + { + "epoch": 1.8328456939320699, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7553311586380005, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8674821853637695, + "num_tokens": 549500791.0, + "step": 14408 + }, + { + "epoch": 1.8329729042106602, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.875884771347046, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8696898818016052, + "num_tokens": 549533343.0, + "step": 14409 + }, + { + "epoch": 1.8331001144892507, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.82288658618927, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8550757169723511, + "num_tokens": 549572707.0, + "step": 14410 + }, + { + "epoch": 1.8332273247678412, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8763117790222168, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8712432384490967, + "num_tokens": 549609772.0, + "step": 14411 + }, + { + "epoch": 1.8333545350464318, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8734898567199707, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.853046178817749, + "num_tokens": 549648626.0, + "step": 14412 + }, + { + "epoch": 1.8334817453250223, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8276827335357666, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8529031276702881, + "num_tokens": 549693211.0, + "step": 14413 + }, + { + "epoch": 1.8336089556036128, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.048675298690796, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8591215014457703, + "num_tokens": 549729463.0, + "step": 14414 + }, + { + "epoch": 1.8337361658822031, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7875759601593018, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8701725006103516, + "num_tokens": 549769139.0, + "step": 14415 + }, + { + "epoch": 1.8338633761607936, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3500027656555176, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8687014579772949, + "num_tokens": 549807067.0, + "step": 14416 + }, + { + "epoch": 1.8339905864393842, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 3.1876919269561768, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8629702925682068, + "num_tokens": 549842689.0, + "step": 14417 + }, + { + "epoch": 1.8341177967179747, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.923782229423523, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.882282018661499, + "num_tokens": 549876108.0, + "step": 14418 + }, + { + "epoch": 1.8342450069965652, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9056273698806763, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8784228563308716, + "num_tokens": 549909656.0, + "step": 14419 + }, + { + "epoch": 1.8343722172751558, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9444358348846436, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.856316864490509, + "num_tokens": 549949156.0, + "step": 14420 + }, + { + "epoch": 1.8344994275537463, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8805168867111206, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8709213733673096, + "num_tokens": 549990256.0, + "step": 14421 + }, + { + "epoch": 1.8346266378323368, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9060616493225098, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.872552752494812, + "num_tokens": 550023557.0, + "step": 14422 + }, + { + "epoch": 1.8347538481109273, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.728318452835083, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8609333634376526, + "num_tokens": 550069402.0, + "step": 14423 + }, + { + "epoch": 1.8348810583895179, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.005349636077881, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8718566298484802, + "num_tokens": 550110860.0, + "step": 14424 + }, + { + "epoch": 1.8350082686681084, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3951447010040283, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8489046692848206, + "num_tokens": 550151736.0, + "step": 14425 + }, + { + "epoch": 1.835135478946699, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9058187007904053, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8655036687850952, + "num_tokens": 550187731.0, + "step": 14426 + }, + { + "epoch": 1.8352626892252895, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9829158782958984, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8661218881607056, + "num_tokens": 550220545.0, + "step": 14427 + }, + { + "epoch": 1.83538989950388, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8500025272369385, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8670474290847778, + "num_tokens": 550254755.0, + "step": 14428 + }, + { + "epoch": 1.8355171097824705, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7406708002090454, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8707189559936523, + "num_tokens": 550295191.0, + "step": 14429 + }, + { + "epoch": 1.835644320061061, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9072833061218262, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8571573495864868, + "num_tokens": 550332984.0, + "step": 14430 + }, + { + "epoch": 1.8357715303396516, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.5939655303955078, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8677785396575928, + "num_tokens": 550377470.0, + "step": 14431 + }, + { + "epoch": 1.835898740618242, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.801349401473999, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8685345649719238, + "num_tokens": 550416123.0, + "step": 14432 + }, + { + "epoch": 1.8360259508968326, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.858561635017395, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8702282309532166, + "num_tokens": 550458062.0, + "step": 14433 + }, + { + "epoch": 1.836153161175423, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9234281778335571, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8575618267059326, + "num_tokens": 550496485.0, + "step": 14434 + }, + { + "epoch": 1.8362803714540135, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0897974967956543, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8654651641845703, + "num_tokens": 550536500.0, + "step": 14435 + }, + { + "epoch": 1.836407581732604, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9515775442123413, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8583821058273315, + "num_tokens": 550575611.0, + "step": 14436 + }, + { + "epoch": 1.8365347920111945, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7881234884262085, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8558872938156128, + "num_tokens": 550616431.0, + "step": 14437 + }, + { + "epoch": 1.836662002289785, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.839881420135498, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8738932013511658, + "num_tokens": 550653626.0, + "step": 14438 + }, + { + "epoch": 1.8367892125683756, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.911693811416626, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8599684238433838, + "num_tokens": 550690899.0, + "step": 14439 + }, + { + "epoch": 1.8369164228469659, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.834333062171936, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8645220994949341, + "num_tokens": 550734408.0, + "step": 14440 + }, + { + "epoch": 1.8370436331255564, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0355467796325684, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8524263501167297, + "num_tokens": 550767888.0, + "step": 14441 + }, + { + "epoch": 1.837170843404147, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8310267925262451, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8660053610801697, + "num_tokens": 550807192.0, + "step": 14442 + }, + { + "epoch": 1.8372980536827375, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7898660898208618, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8734691143035889, + "num_tokens": 550848006.0, + "step": 14443 + }, + { + "epoch": 1.837425263961328, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7825783491134644, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8648422956466675, + "num_tokens": 550888350.0, + "step": 14444 + }, + { + "epoch": 1.8375524742399185, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.4295952320098877, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.870890736579895, + "num_tokens": 550925389.0, + "step": 14445 + }, + { + "epoch": 1.837679684518509, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9852384328842163, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8707877397537231, + "num_tokens": 550962610.0, + "step": 14446 + }, + { + "epoch": 1.8378068947970996, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9665981531143188, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8614877462387085, + "num_tokens": 551001018.0, + "step": 14447 + }, + { + "epoch": 1.83793410507569, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1229841709136963, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8656401634216309, + "num_tokens": 551038505.0, + "step": 14448 + }, + { + "epoch": 1.8380613153542806, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8148300647735596, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8764041662216187, + "num_tokens": 551072235.0, + "step": 14449 + }, + { + "epoch": 1.8381885256328712, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9708658456802368, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8770990371704102, + "num_tokens": 551110129.0, + "step": 14450 + }, + { + "epoch": 1.8383157359114617, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8162511587142944, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8661729693412781, + "num_tokens": 551152380.0, + "step": 14451 + }, + { + "epoch": 1.8384429461900522, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7238965034484863, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8752462267875671, + "num_tokens": 551195942.0, + "step": 14452 + }, + { + "epoch": 1.8385701564686427, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8435039520263672, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8574681282043457, + "num_tokens": 551238502.0, + "step": 14453 + }, + { + "epoch": 1.8386973667472333, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7282301187515259, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8740196228027344, + "num_tokens": 551278091.0, + "step": 14454 + }, + { + "epoch": 1.8388245770258238, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9116045236587524, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8438535928726196, + "num_tokens": 551314940.0, + "step": 14455 + }, + { + "epoch": 1.8389517873044143, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8018460273742676, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8510183095932007, + "num_tokens": 551356427.0, + "step": 14456 + }, + { + "epoch": 1.8390789975830049, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.836228609085083, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8689571619033813, + "num_tokens": 551400436.0, + "step": 14457 + }, + { + "epoch": 1.8392062078615952, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8120156526565552, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.863434910774231, + "num_tokens": 551438335.0, + "step": 14458 + }, + { + "epoch": 1.8393334181401857, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9055875539779663, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8658949136734009, + "num_tokens": 551471214.0, + "step": 14459 + }, + { + "epoch": 1.8394606284187762, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1207692623138428, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8514487147331238, + "num_tokens": 551504071.0, + "step": 14460 + }, + { + "epoch": 1.8395878386973668, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9162462949752808, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8597918152809143, + "num_tokens": 551538456.0, + "step": 14461 + }, + { + "epoch": 1.8397150489759573, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8851920366287231, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8682775497436523, + "num_tokens": 551575334.0, + "step": 14462 + }, + { + "epoch": 1.8398422592545478, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.821259617805481, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8730571269989014, + "num_tokens": 551617127.0, + "step": 14463 + }, + { + "epoch": 1.8399694695331381, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8085522651672363, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8703255653381348, + "num_tokens": 551655548.0, + "step": 14464 + }, + { + "epoch": 1.8400966798117286, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0838425159454346, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8613072037696838, + "num_tokens": 551691807.0, + "step": 14465 + }, + { + "epoch": 1.8402238900903192, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 3.56215763092041, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8726445436477661, + "num_tokens": 551729805.0, + "step": 14466 + }, + { + "epoch": 1.8403511003689097, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 7.757126808166504, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8419985771179199, + "num_tokens": 551775220.0, + "step": 14467 + }, + { + "epoch": 1.8404783106475002, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0072455406188965, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8565928936004639, + "num_tokens": 551816242.0, + "step": 14468 + }, + { + "epoch": 1.8406055209260908, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.907713770866394, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8536574840545654, + "num_tokens": 551854929.0, + "step": 14469 + }, + { + "epoch": 1.8407327312046813, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8082242012023926, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8627495765686035, + "num_tokens": 551898877.0, + "step": 14470 + }, + { + "epoch": 1.8408599414832718, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9442936182022095, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8803737163543701, + "num_tokens": 551934252.0, + "step": 14471 + }, + { + "epoch": 1.8409871517618623, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7880111932754517, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8756275177001953, + "num_tokens": 551974779.0, + "step": 14472 + }, + { + "epoch": 1.8411143620404529, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.82274329662323, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8649951219558716, + "num_tokens": 552011627.0, + "step": 14473 + }, + { + "epoch": 1.8412415723190434, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.976946473121643, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8432970643043518, + "num_tokens": 552045333.0, + "step": 14474 + }, + { + "epoch": 1.841368782597634, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9650452136993408, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8579301834106445, + "num_tokens": 552079974.0, + "step": 14475 + }, + { + "epoch": 1.8414959928762245, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.839070200920105, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8720377683639526, + "num_tokens": 552117507.0, + "step": 14476 + }, + { + "epoch": 1.841623203154815, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.4579432010650635, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8627707362174988, + "num_tokens": 552156009.0, + "step": 14477 + }, + { + "epoch": 1.8417504134334055, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.149488687515259, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.872100293636322, + "num_tokens": 552189092.0, + "step": 14478 + }, + { + "epoch": 1.841877623711996, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8683072328567505, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8625417947769165, + "num_tokens": 552228305.0, + "step": 14479 + }, + { + "epoch": 1.8420048339905866, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7472409009933472, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8662077784538269, + "num_tokens": 552269402.0, + "step": 14480 + }, + { + "epoch": 1.842132044269177, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0293729305267334, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8612669706344604, + "num_tokens": 552304870.0, + "step": 14481 + }, + { + "epoch": 1.8422592545477676, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.030179977416992, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8592909574508667, + "num_tokens": 552336802.0, + "step": 14482 + }, + { + "epoch": 1.842386464826358, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.82356595993042, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8777469992637634, + "num_tokens": 552376124.0, + "step": 14483 + }, + { + "epoch": 1.8425136751049485, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.832661509513855, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8562949895858765, + "num_tokens": 552412285.0, + "step": 14484 + }, + { + "epoch": 1.842640885383539, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8739643096923828, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8645620942115784, + "num_tokens": 552447527.0, + "step": 14485 + }, + { + "epoch": 1.8427680956621295, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7274765968322754, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8674503564834595, + "num_tokens": 552486569.0, + "step": 14486 + }, + { + "epoch": 1.84289530594072, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8218772411346436, + "learning_rate": 1e-06, + "loss": 0.5713, + "mean_token_accuracy": 0.8292859792709351, + "num_tokens": 552529158.0, + "step": 14487 + }, + { + "epoch": 1.8430225162193106, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9018893241882324, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8627984523773193, + "num_tokens": 552572183.0, + "step": 14488 + }, + { + "epoch": 1.8431497264979009, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8107874393463135, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8627718687057495, + "num_tokens": 552609785.0, + "step": 14489 + }, + { + "epoch": 1.8432769367764914, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.945083737373352, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8714281320571899, + "num_tokens": 552641243.0, + "step": 14490 + }, + { + "epoch": 1.843404147055082, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.955434799194336, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8571124076843262, + "num_tokens": 552677595.0, + "step": 14491 + }, + { + "epoch": 1.8435313573336725, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.937735676765442, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8650892972946167, + "num_tokens": 552715492.0, + "step": 14492 + }, + { + "epoch": 1.843658567612263, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7079590559005737, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.864184558391571, + "num_tokens": 552757410.0, + "step": 14493 + }, + { + "epoch": 1.8437857778908535, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7391778230667114, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8806139826774597, + "num_tokens": 552793794.0, + "step": 14494 + }, + { + "epoch": 1.843912988169444, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7951278686523438, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8503104448318481, + "num_tokens": 552836010.0, + "step": 14495 + }, + { + "epoch": 1.8440401984480346, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8068822622299194, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8680850267410278, + "num_tokens": 552874136.0, + "step": 14496 + }, + { + "epoch": 1.844167408726625, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.030163049697876, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8616312742233276, + "num_tokens": 552912314.0, + "step": 14497 + }, + { + "epoch": 1.8442946190052156, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8204468488693237, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8531090021133423, + "num_tokens": 552953998.0, + "step": 14498 + }, + { + "epoch": 1.8444218292838062, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8461500406265259, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8848993182182312, + "num_tokens": 552987310.0, + "step": 14499 + }, + { + "epoch": 1.8445490395623967, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.941985011100769, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8614733815193176, + "num_tokens": 553019830.0, + "step": 14500 + }, + { + "epoch": 1.8446762498409872, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7276886701583862, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8631672263145447, + "num_tokens": 553066695.0, + "step": 14501 + }, + { + "epoch": 1.8448034601195777, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.031581401824951, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8567521572113037, + "num_tokens": 553098943.0, + "step": 14502 + }, + { + "epoch": 1.8449306703981683, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9322118759155273, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8696810007095337, + "num_tokens": 553136106.0, + "step": 14503 + }, + { + "epoch": 1.8450578806767588, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1073668003082275, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8700085878372192, + "num_tokens": 553170014.0, + "step": 14504 + }, + { + "epoch": 1.8451850909553493, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8677719831466675, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8589667081832886, + "num_tokens": 553213297.0, + "step": 14505 + }, + { + "epoch": 1.8453123012339399, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9573612213134766, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8661152720451355, + "num_tokens": 553248559.0, + "step": 14506 + }, + { + "epoch": 1.8454395115125302, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9160473346710205, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8659380674362183, + "num_tokens": 553289140.0, + "step": 14507 + }, + { + "epoch": 1.8455667217911207, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.83809494972229, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8573353290557861, + "num_tokens": 553330619.0, + "step": 14508 + }, + { + "epoch": 1.8456939320697112, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8234145641326904, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8738936185836792, + "num_tokens": 553371295.0, + "step": 14509 + }, + { + "epoch": 1.8458211423483017, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.89889657497406, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8609004616737366, + "num_tokens": 553410237.0, + "step": 14510 + }, + { + "epoch": 1.8459483526268923, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0503664016723633, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8720319271087646, + "num_tokens": 553446727.0, + "step": 14511 + }, + { + "epoch": 1.8460755629054828, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0535786151885986, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8707388043403625, + "num_tokens": 553488626.0, + "step": 14512 + }, + { + "epoch": 1.846202773184073, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8912016153335571, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.856499969959259, + "num_tokens": 553528294.0, + "step": 14513 + }, + { + "epoch": 1.8463299834626636, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.088379144668579, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8689088821411133, + "num_tokens": 553563738.0, + "step": 14514 + }, + { + "epoch": 1.8464571937412542, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.121778726577759, + "learning_rate": 1e-06, + "loss": 0.5488, + "mean_token_accuracy": 0.8313158750534058, + "num_tokens": 553603351.0, + "step": 14515 + }, + { + "epoch": 1.8465844040198447, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9447503089904785, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8630852699279785, + "num_tokens": 553643655.0, + "step": 14516 + }, + { + "epoch": 1.8467116142984352, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9861572980880737, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8762210607528687, + "num_tokens": 553677944.0, + "step": 14517 + }, + { + "epoch": 1.8468388245770258, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9444830417633057, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8584222793579102, + "num_tokens": 553712120.0, + "step": 14518 + }, + { + "epoch": 1.8469660348556163, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8294947147369385, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8694683313369751, + "num_tokens": 553751442.0, + "step": 14519 + }, + { + "epoch": 1.8470932451342068, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8709384202957153, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.871172308921814, + "num_tokens": 553782924.0, + "step": 14520 + }, + { + "epoch": 1.8472204554127973, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9365845918655396, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8566851615905762, + "num_tokens": 553818861.0, + "step": 14521 + }, + { + "epoch": 1.8473476656913879, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9394840002059937, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8626699447631836, + "num_tokens": 553855593.0, + "step": 14522 + }, + { + "epoch": 1.8474748759699784, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.946446180343628, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8628631830215454, + "num_tokens": 553896028.0, + "step": 14523 + }, + { + "epoch": 1.847602086248569, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9913883209228516, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8681988716125488, + "num_tokens": 553935212.0, + "step": 14524 + }, + { + "epoch": 1.8477292965271594, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.967606544494629, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8626285195350647, + "num_tokens": 553971231.0, + "step": 14525 + }, + { + "epoch": 1.84785650680575, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9726933240890503, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8737058639526367, + "num_tokens": 554006782.0, + "step": 14526 + }, + { + "epoch": 1.8479837170843405, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.856446385383606, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8584983348846436, + "num_tokens": 554043427.0, + "step": 14527 + }, + { + "epoch": 1.848110927362931, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8335881233215332, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8541901707649231, + "num_tokens": 554084187.0, + "step": 14528 + }, + { + "epoch": 1.8482381376415216, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0481324195861816, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.863551139831543, + "num_tokens": 554122186.0, + "step": 14529 + }, + { + "epoch": 1.848365347920112, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0045464038848877, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8646646738052368, + "num_tokens": 554161388.0, + "step": 14530 + }, + { + "epoch": 1.8484925581987026, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0284855365753174, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8574739694595337, + "num_tokens": 554199837.0, + "step": 14531 + }, + { + "epoch": 1.848619768477293, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8044873476028442, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8534879684448242, + "num_tokens": 554239567.0, + "step": 14532 + }, + { + "epoch": 1.8487469787558835, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8941917419433594, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8684316277503967, + "num_tokens": 554277040.0, + "step": 14533 + }, + { + "epoch": 1.848874189034474, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.779702067375183, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8700502514839172, + "num_tokens": 554318994.0, + "step": 14534 + }, + { + "epoch": 1.8490013993130645, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9569346904754639, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8591175079345703, + "num_tokens": 554355334.0, + "step": 14535 + }, + { + "epoch": 1.849128609591655, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9284030199050903, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8540302515029907, + "num_tokens": 554396504.0, + "step": 14536 + }, + { + "epoch": 1.8492558198702456, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.220517873764038, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8708794116973877, + "num_tokens": 554429772.0, + "step": 14537 + }, + { + "epoch": 1.8493830301488359, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9093667268753052, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8663864135742188, + "num_tokens": 554470102.0, + "step": 14538 + }, + { + "epoch": 1.8495102404274264, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7922636270523071, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8548079133033752, + "num_tokens": 554511632.0, + "step": 14539 + }, + { + "epoch": 1.849637450706017, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7445075511932373, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8609150648117065, + "num_tokens": 554553028.0, + "step": 14540 + }, + { + "epoch": 1.8497646609846075, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9141019582748413, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.859409511089325, + "num_tokens": 554585109.0, + "step": 14541 + }, + { + "epoch": 1.849891871263198, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9748880863189697, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8651390075683594, + "num_tokens": 554621141.0, + "step": 14542 + }, + { + "epoch": 1.8500190815417885, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8247214555740356, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8638095855712891, + "num_tokens": 554660838.0, + "step": 14543 + }, + { + "epoch": 1.850146291820379, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.011167526245117, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.865260124206543, + "num_tokens": 554696338.0, + "step": 14544 + }, + { + "epoch": 1.8502735020989696, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0440354347229004, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8768771290779114, + "num_tokens": 554729782.0, + "step": 14545 + }, + { + "epoch": 1.85040071237756, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.09682035446167, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8799182176589966, + "num_tokens": 554767566.0, + "step": 14546 + }, + { + "epoch": 1.8505279226561506, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.974028468132019, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8682315945625305, + "num_tokens": 554806063.0, + "step": 14547 + }, + { + "epoch": 1.8506551329347412, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.94229257106781, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8618736267089844, + "num_tokens": 554839418.0, + "step": 14548 + }, + { + "epoch": 1.8507823432133317, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.91110360622406, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8602883815765381, + "num_tokens": 554881888.0, + "step": 14549 + }, + { + "epoch": 1.8509095534919222, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8450000286102295, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8824177980422974, + "num_tokens": 554921561.0, + "step": 14550 + }, + { + "epoch": 1.8510367637705127, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0926387310028076, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8404049873352051, + "num_tokens": 554954380.0, + "step": 14551 + }, + { + "epoch": 1.8511639740491033, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7904866933822632, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8661291599273682, + "num_tokens": 554994781.0, + "step": 14552 + }, + { + "epoch": 1.8512911843276938, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0233731269836426, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8719028234481812, + "num_tokens": 555024938.0, + "step": 14553 + }, + { + "epoch": 1.8514183946062843, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.004544734954834, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8530544638633728, + "num_tokens": 555059785.0, + "step": 14554 + }, + { + "epoch": 1.8515456048848749, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.136676788330078, + "learning_rate": 1e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8397071957588196, + "num_tokens": 555097502.0, + "step": 14555 + }, + { + "epoch": 1.8516728151634652, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9877328872680664, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8604695796966553, + "num_tokens": 555139976.0, + "step": 14556 + }, + { + "epoch": 1.8518000254420557, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.755693793296814, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8671842813491821, + "num_tokens": 555181250.0, + "step": 14557 + }, + { + "epoch": 1.8519272357206462, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0961270332336426, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8449520468711853, + "num_tokens": 555214927.0, + "step": 14558 + }, + { + "epoch": 1.8520544459992367, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8964718580245972, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8615637421607971, + "num_tokens": 555255795.0, + "step": 14559 + }, + { + "epoch": 1.8521816562778273, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8698874711990356, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8666157126426697, + "num_tokens": 555294177.0, + "step": 14560 + }, + { + "epoch": 1.8523088665564178, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.806528925895691, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8771137595176697, + "num_tokens": 555331548.0, + "step": 14561 + }, + { + "epoch": 1.852436076835008, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8176724910736084, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8754875659942627, + "num_tokens": 555368426.0, + "step": 14562 + }, + { + "epoch": 1.8525632871135986, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8267170190811157, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8805708885192871, + "num_tokens": 555411594.0, + "step": 14563 + }, + { + "epoch": 1.8526904973921892, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 80.52320098876953, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8569755554199219, + "num_tokens": 555450412.0, + "step": 14564 + }, + { + "epoch": 1.8528177076707797, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1970126628875732, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8730723261833191, + "num_tokens": 555489067.0, + "step": 14565 + }, + { + "epoch": 1.8529449179493702, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.3212993144989014, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.869926929473877, + "num_tokens": 555524552.0, + "step": 14566 + }, + { + "epoch": 1.8530721282279607, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9265285730361938, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8387453556060791, + "num_tokens": 555563766.0, + "step": 14567 + }, + { + "epoch": 1.8531993385065513, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.934211015701294, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8719527721405029, + "num_tokens": 555601410.0, + "step": 14568 + }, + { + "epoch": 1.8533265487851418, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8447768688201904, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8630249500274658, + "num_tokens": 555641346.0, + "step": 14569 + }, + { + "epoch": 1.8534537590637323, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.961952567100525, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8738323450088501, + "num_tokens": 555676537.0, + "step": 14570 + }, + { + "epoch": 1.8535809693423229, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 16.592243194580078, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8503732681274414, + "num_tokens": 555721634.0, + "step": 14571 + }, + { + "epoch": 1.8537081796209134, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8306946754455566, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8647162914276123, + "num_tokens": 555765447.0, + "step": 14572 + }, + { + "epoch": 1.853835389899504, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2241923809051514, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8635658025741577, + "num_tokens": 555798598.0, + "step": 14573 + }, + { + "epoch": 1.8539626001780944, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0154693126678467, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8717005252838135, + "num_tokens": 555829465.0, + "step": 14574 + }, + { + "epoch": 1.854089810456685, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9387296438217163, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8653771877288818, + "num_tokens": 555866577.0, + "step": 14575 + }, + { + "epoch": 1.8542170207352755, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8723320960998535, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8673590421676636, + "num_tokens": 555900472.0, + "step": 14576 + }, + { + "epoch": 1.854344231013866, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9196245670318604, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8587612509727478, + "num_tokens": 555936356.0, + "step": 14577 + }, + { + "epoch": 1.8544714412924566, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9812666177749634, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.858110249042511, + "num_tokens": 555975338.0, + "step": 14578 + }, + { + "epoch": 1.854598651571047, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.816318154335022, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8550001978874207, + "num_tokens": 556019651.0, + "step": 14579 + }, + { + "epoch": 1.8547258618496376, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9288694858551025, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.859870433807373, + "num_tokens": 556054530.0, + "step": 14580 + }, + { + "epoch": 1.854853072128228, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8967461585998535, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8599367141723633, + "num_tokens": 556091985.0, + "step": 14581 + }, + { + "epoch": 1.8549802824068184, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0144314765930176, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.86820387840271, + "num_tokens": 556128777.0, + "step": 14582 + }, + { + "epoch": 1.855107492685409, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9365960359573364, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.883668065071106, + "num_tokens": 556164286.0, + "step": 14583 + }, + { + "epoch": 1.8552347029639995, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9081084728240967, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8727864623069763, + "num_tokens": 556199141.0, + "step": 14584 + }, + { + "epoch": 1.85536191324259, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9147915840148926, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8701984882354736, + "num_tokens": 556237104.0, + "step": 14585 + }, + { + "epoch": 1.8554891235211806, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8878947496414185, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8540952205657959, + "num_tokens": 556275343.0, + "step": 14586 + }, + { + "epoch": 1.8556163337997709, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7143999338150024, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8807094693183899, + "num_tokens": 556316539.0, + "step": 14587 + }, + { + "epoch": 1.8557435440783614, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.097048282623291, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8764278888702393, + "num_tokens": 556351101.0, + "step": 14588 + }, + { + "epoch": 1.855870754356952, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9443060159683228, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8761361241340637, + "num_tokens": 556382458.0, + "step": 14589 + }, + { + "epoch": 1.8559979646355425, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8928993940353394, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8679153323173523, + "num_tokens": 556416980.0, + "step": 14590 + }, + { + "epoch": 1.856125174914133, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.005168914794922, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8694518208503723, + "num_tokens": 556451868.0, + "step": 14591 + }, + { + "epoch": 1.8562523851927235, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9759544134140015, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8533675670623779, + "num_tokens": 556485682.0, + "step": 14592 + }, + { + "epoch": 1.856379595471314, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8347831964492798, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8761407136917114, + "num_tokens": 556524857.0, + "step": 14593 + }, + { + "epoch": 1.8565068057499046, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.94471275806427, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8693084716796875, + "num_tokens": 556565763.0, + "step": 14594 + }, + { + "epoch": 1.856634016028495, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8947021961212158, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.853702187538147, + "num_tokens": 556606202.0, + "step": 14595 + }, + { + "epoch": 1.8567612263070856, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.730757474899292, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8728469014167786, + "num_tokens": 556646906.0, + "step": 14596 + }, + { + "epoch": 1.8568884365856761, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9514319896697998, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8664551973342896, + "num_tokens": 556684249.0, + "step": 14597 + }, + { + "epoch": 1.8570156468642667, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.102947235107422, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8533727526664734, + "num_tokens": 556726840.0, + "step": 14598 + }, + { + "epoch": 1.8571428571428572, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.808640718460083, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8661195039749146, + "num_tokens": 556764822.0, + "step": 14599 + }, + { + "epoch": 1.8572700674214477, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8417037725448608, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8637264966964722, + "num_tokens": 556800884.0, + "step": 14600 + }, + { + "epoch": 1.8573972777000383, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8336784839630127, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8714701533317566, + "num_tokens": 556841233.0, + "step": 14601 + }, + { + "epoch": 1.8575244879786288, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9507890939712524, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8579584360122681, + "num_tokens": 556880995.0, + "step": 14602 + }, + { + "epoch": 1.8576516982572193, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9339548349380493, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8697845339775085, + "num_tokens": 556914367.0, + "step": 14603 + }, + { + "epoch": 1.8577789085358098, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0140576362609863, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8449923992156982, + "num_tokens": 556955944.0, + "step": 14604 + }, + { + "epoch": 1.8579061188144002, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.079651117324829, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8480930328369141, + "num_tokens": 556989851.0, + "step": 14605 + }, + { + "epoch": 1.8580333290929907, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8382582664489746, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8629496693611145, + "num_tokens": 557033662.0, + "step": 14606 + }, + { + "epoch": 1.8581605393715812, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.815437912940979, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8571537733078003, + "num_tokens": 557074869.0, + "step": 14607 + }, + { + "epoch": 1.8582877496501717, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.869476556777954, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8530203104019165, + "num_tokens": 557110164.0, + "step": 14608 + }, + { + "epoch": 1.8584149599287623, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8766300678253174, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.864227294921875, + "num_tokens": 557149309.0, + "step": 14609 + }, + { + "epoch": 1.8585421702073528, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.03957200050354, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8598506450653076, + "num_tokens": 557180510.0, + "step": 14610 + }, + { + "epoch": 1.858669380485943, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9561229944229126, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8688842058181763, + "num_tokens": 557218062.0, + "step": 14611 + }, + { + "epoch": 1.8587965907645336, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8707809448242188, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8563652634620667, + "num_tokens": 557258963.0, + "step": 14612 + }, + { + "epoch": 1.8589238010431242, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9156075716018677, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8656893372535706, + "num_tokens": 557296046.0, + "step": 14613 + }, + { + "epoch": 1.8590510113217147, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.282233953475952, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8632091283798218, + "num_tokens": 557328395.0, + "step": 14614 + }, + { + "epoch": 1.8591782216003052, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8849270343780518, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8680117726325989, + "num_tokens": 557363888.0, + "step": 14615 + }, + { + "epoch": 1.8593054318788957, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9194122552871704, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8564911484718323, + "num_tokens": 557402944.0, + "step": 14616 + }, + { + "epoch": 1.8594326421574863, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.938207745552063, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8703210949897766, + "num_tokens": 557442247.0, + "step": 14617 + }, + { + "epoch": 1.8595598524360768, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7753936052322388, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8572734594345093, + "num_tokens": 557483580.0, + "step": 14618 + }, + { + "epoch": 1.8596870627146673, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7269889116287231, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8678870797157288, + "num_tokens": 557520134.0, + "step": 14619 + }, + { + "epoch": 1.8598142729932579, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7308719158172607, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8589137196540833, + "num_tokens": 557565726.0, + "step": 14620 + }, + { + "epoch": 1.8599414832718484, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8457545042037964, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8618143796920776, + "num_tokens": 557602871.0, + "step": 14621 + }, + { + "epoch": 1.860068693550439, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7834444046020508, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8568158149719238, + "num_tokens": 557640873.0, + "step": 14622 + }, + { + "epoch": 1.8601959038290294, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.00830078125, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8587926626205444, + "num_tokens": 557684233.0, + "step": 14623 + }, + { + "epoch": 1.86032311410762, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0282604694366455, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8698325157165527, + "num_tokens": 557716895.0, + "step": 14624 + }, + { + "epoch": 1.8604503243862105, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.24650239944458, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8616491556167603, + "num_tokens": 557758197.0, + "step": 14625 + }, + { + "epoch": 1.860577534664801, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1575217247009277, + "learning_rate": 1e-06, + "loss": 0.5367, + "mean_token_accuracy": 0.8366490602493286, + "num_tokens": 557794196.0, + "step": 14626 + }, + { + "epoch": 1.8607047449433916, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8586108684539795, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8473955988883972, + "num_tokens": 557833856.0, + "step": 14627 + }, + { + "epoch": 1.860831955221982, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1722872257232666, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8510051965713501, + "num_tokens": 557866304.0, + "step": 14628 + }, + { + "epoch": 1.8609591655005726, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8111859560012817, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.876418948173523, + "num_tokens": 557907065.0, + "step": 14629 + }, + { + "epoch": 1.861086375779163, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.770989179611206, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8601062297821045, + "num_tokens": 557947326.0, + "step": 14630 + }, + { + "epoch": 1.8612135860577534, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8418772220611572, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8631812334060669, + "num_tokens": 557990059.0, + "step": 14631 + }, + { + "epoch": 1.861340796336344, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.915274739265442, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8656415939331055, + "num_tokens": 558028406.0, + "step": 14632 + }, + { + "epoch": 1.8614680066149345, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8090288639068604, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8607903718948364, + "num_tokens": 558070239.0, + "step": 14633 + }, + { + "epoch": 1.861595216893525, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7804657220840454, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8680452704429626, + "num_tokens": 558107685.0, + "step": 14634 + }, + { + "epoch": 1.8617224271721156, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8108773231506348, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8579839468002319, + "num_tokens": 558143185.0, + "step": 14635 + }, + { + "epoch": 1.8618496374507059, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8811663389205933, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.872870147228241, + "num_tokens": 558176207.0, + "step": 14636 + }, + { + "epoch": 1.8619768477292964, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1610701084136963, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8793536424636841, + "num_tokens": 558212215.0, + "step": 14637 + }, + { + "epoch": 1.862104058007887, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8925960063934326, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8724453449249268, + "num_tokens": 558248471.0, + "step": 14638 + }, + { + "epoch": 1.8622312682864774, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9237028360366821, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8505085706710815, + "num_tokens": 558285211.0, + "step": 14639 + }, + { + "epoch": 1.862358478565068, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3766183853149414, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8571481704711914, + "num_tokens": 558314434.0, + "step": 14640 + }, + { + "epoch": 1.8624856888436585, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.036168336868286, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8718349933624268, + "num_tokens": 558353710.0, + "step": 14641 + }, + { + "epoch": 1.862612899122249, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9461886882781982, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8518354892730713, + "num_tokens": 558390476.0, + "step": 14642 + }, + { + "epoch": 1.8627401094008396, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8824512958526611, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8727917075157166, + "num_tokens": 558424917.0, + "step": 14643 + }, + { + "epoch": 1.86286731967943, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9856573343276978, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8571709394454956, + "num_tokens": 558459831.0, + "step": 14644 + }, + { + "epoch": 1.8629945299580206, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2510037422180176, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8791985511779785, + "num_tokens": 558498600.0, + "step": 14645 + }, + { + "epoch": 1.8631217402366111, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0195114612579346, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8548510670661926, + "num_tokens": 558537465.0, + "step": 14646 + }, + { + "epoch": 1.8632489505152017, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8190544843673706, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8838801383972168, + "num_tokens": 558575357.0, + "step": 14647 + }, + { + "epoch": 1.8633761607937922, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7602592706680298, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8610508441925049, + "num_tokens": 558619422.0, + "step": 14648 + }, + { + "epoch": 1.8635033710723827, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8458973169326782, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8549854755401611, + "num_tokens": 558657677.0, + "step": 14649 + }, + { + "epoch": 1.8636305813509733, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.919836163520813, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8722468614578247, + "num_tokens": 558692912.0, + "step": 14650 + }, + { + "epoch": 1.8637577916295638, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.717011570930481, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8759305477142334, + "num_tokens": 558733470.0, + "step": 14651 + }, + { + "epoch": 1.8638850019081543, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0625193119049072, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.860069751739502, + "num_tokens": 558771899.0, + "step": 14652 + }, + { + "epoch": 1.8640122121867448, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.016512870788574, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8689329624176025, + "num_tokens": 558813002.0, + "step": 14653 + }, + { + "epoch": 1.8641394224653351, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0019538402557373, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8486804962158203, + "num_tokens": 558850827.0, + "step": 14654 + }, + { + "epoch": 1.8642666327439257, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.665450096130371, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.860160231590271, + "num_tokens": 558889003.0, + "step": 14655 + }, + { + "epoch": 1.8643938430225162, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.077813148498535, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8703028559684753, + "num_tokens": 558925849.0, + "step": 14656 + }, + { + "epoch": 1.8645210533011067, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9783382415771484, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8629903197288513, + "num_tokens": 558967471.0, + "step": 14657 + }, + { + "epoch": 1.8646482635796973, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9943557977676392, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8672237396240234, + "num_tokens": 559001938.0, + "step": 14658 + }, + { + "epoch": 1.8647754738582878, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2630326747894287, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8709161281585693, + "num_tokens": 559040515.0, + "step": 14659 + }, + { + "epoch": 1.864902684136878, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.132845163345337, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8775959014892578, + "num_tokens": 559077702.0, + "step": 14660 + }, + { + "epoch": 1.8650298944154686, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0763232707977295, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8565076589584351, + "num_tokens": 559114741.0, + "step": 14661 + }, + { + "epoch": 1.8651571046940592, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0653786659240723, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8416202664375305, + "num_tokens": 559155844.0, + "step": 14662 + }, + { + "epoch": 1.8652843149726497, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9367430210113525, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.866310715675354, + "num_tokens": 559194964.0, + "step": 14663 + }, + { + "epoch": 1.8654115252512402, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.983028531074524, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8705220222473145, + "num_tokens": 559230083.0, + "step": 14664 + }, + { + "epoch": 1.8655387355298307, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.915244698524475, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8760586977005005, + "num_tokens": 559268397.0, + "step": 14665 + }, + { + "epoch": 1.8656659458084213, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.943913221359253, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8555211424827576, + "num_tokens": 559302024.0, + "step": 14666 + }, + { + "epoch": 1.8657931560870118, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9886618852615356, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8615365624427795, + "num_tokens": 559343192.0, + "step": 14667 + }, + { + "epoch": 1.8659203663656023, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8063710927963257, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8871117234230042, + "num_tokens": 559380616.0, + "step": 14668 + }, + { + "epoch": 1.8660475766441929, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.553201913833618, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8748356103897095, + "num_tokens": 559413621.0, + "step": 14669 + }, + { + "epoch": 1.8661747869227834, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.836612343788147, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8735376000404358, + "num_tokens": 559447274.0, + "step": 14670 + }, + { + "epoch": 1.866301997201374, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.881629228591919, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8741136789321899, + "num_tokens": 559484306.0, + "step": 14671 + }, + { + "epoch": 1.8664292074799644, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1683359146118164, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8711334466934204, + "num_tokens": 559527126.0, + "step": 14672 + }, + { + "epoch": 1.866556417758555, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7750482559204102, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8686212301254272, + "num_tokens": 559568342.0, + "step": 14673 + }, + { + "epoch": 1.8666836280371455, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.923262357711792, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.861406683921814, + "num_tokens": 559609597.0, + "step": 14674 + }, + { + "epoch": 1.866810838315736, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.95186185836792, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8631343841552734, + "num_tokens": 559646392.0, + "step": 14675 + }, + { + "epoch": 1.8669380485943265, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3781657218933105, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8668949604034424, + "num_tokens": 559684836.0, + "step": 14676 + }, + { + "epoch": 1.867065258872917, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9413871765136719, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8682866096496582, + "num_tokens": 559727793.0, + "step": 14677 + }, + { + "epoch": 1.8671924691515076, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0128676891326904, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8535889387130737, + "num_tokens": 559768154.0, + "step": 14678 + }, + { + "epoch": 1.867319679430098, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.033234119415283, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.865260124206543, + "num_tokens": 559798732.0, + "step": 14679 + }, + { + "epoch": 1.8674468897086884, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7982444763183594, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8647238612174988, + "num_tokens": 559842796.0, + "step": 14680 + }, + { + "epoch": 1.867574099987279, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.858640432357788, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8618223667144775, + "num_tokens": 559884198.0, + "step": 14681 + }, + { + "epoch": 1.8677013102658695, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0303287506103516, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8500690460205078, + "num_tokens": 559916749.0, + "step": 14682 + }, + { + "epoch": 1.86782852054446, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.825585961341858, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.859664261341095, + "num_tokens": 559958430.0, + "step": 14683 + }, + { + "epoch": 1.8679557308230506, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8136016130447388, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8816825151443481, + "num_tokens": 559995720.0, + "step": 14684 + }, + { + "epoch": 1.8680829411016409, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8432646989822388, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8683258295059204, + "num_tokens": 560033304.0, + "step": 14685 + }, + { + "epoch": 1.8682101513802314, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9245117902755737, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8630249500274658, + "num_tokens": 560069879.0, + "step": 14686 + }, + { + "epoch": 1.868337361658822, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9449622631072998, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8627578020095825, + "num_tokens": 560106853.0, + "step": 14687 + }, + { + "epoch": 1.8684645719374124, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9353597164154053, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.865530788898468, + "num_tokens": 560144225.0, + "step": 14688 + }, + { + "epoch": 1.868591782216003, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9330235719680786, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8579908013343811, + "num_tokens": 560186520.0, + "step": 14689 + }, + { + "epoch": 1.8687189924945935, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9204092025756836, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8691142797470093, + "num_tokens": 560225458.0, + "step": 14690 + }, + { + "epoch": 1.868846202773184, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9037742614746094, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8702970147132874, + "num_tokens": 560263401.0, + "step": 14691 + }, + { + "epoch": 1.8689734130517746, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8531254529953003, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8619665503501892, + "num_tokens": 560305026.0, + "step": 14692 + }, + { + "epoch": 1.869100623330365, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8641269207000732, + "learning_rate": 1e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8330396413803101, + "num_tokens": 560354823.0, + "step": 14693 + }, + { + "epoch": 1.8692278336089556, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7978873252868652, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8805726766586304, + "num_tokens": 560391888.0, + "step": 14694 + }, + { + "epoch": 1.8693550438875461, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9141770601272583, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8681392669677734, + "num_tokens": 560425350.0, + "step": 14695 + }, + { + "epoch": 1.8694822541661367, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9533966779708862, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8748950958251953, + "num_tokens": 560458521.0, + "step": 14696 + }, + { + "epoch": 1.8696094644447272, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7595345973968506, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8590275049209595, + "num_tokens": 560506378.0, + "step": 14697 + }, + { + "epoch": 1.8697366747233177, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0158724784851074, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8867653012275696, + "num_tokens": 560544055.0, + "step": 14698 + }, + { + "epoch": 1.8698638850019083, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3788459300994873, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8755410313606262, + "num_tokens": 560577240.0, + "step": 14699 + }, + { + "epoch": 1.8699910952804988, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.110640287399292, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8587543964385986, + "num_tokens": 560615662.0, + "step": 14700 + }, + { + "epoch": 1.8701183055590893, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8645557165145874, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8597019910812378, + "num_tokens": 560658386.0, + "step": 14701 + }, + { + "epoch": 1.8702455158376798, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7779154777526855, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8631370067596436, + "num_tokens": 560696366.0, + "step": 14702 + }, + { + "epoch": 1.8703727261162701, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9781558513641357, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8528962731361389, + "num_tokens": 560736419.0, + "step": 14703 + }, + { + "epoch": 1.8704999363948607, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9411872625350952, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8638299703598022, + "num_tokens": 560777839.0, + "step": 14704 + }, + { + "epoch": 1.8706271466734512, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8204314708709717, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8747063875198364, + "num_tokens": 560816167.0, + "step": 14705 + }, + { + "epoch": 1.8707543569520417, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8967139720916748, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8506002426147461, + "num_tokens": 560854524.0, + "step": 14706 + }, + { + "epoch": 1.8708815672306323, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8298174142837524, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8614611029624939, + "num_tokens": 560894393.0, + "step": 14707 + }, + { + "epoch": 1.8710087775092228, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7360553741455078, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8750818967819214, + "num_tokens": 560938640.0, + "step": 14708 + }, + { + "epoch": 1.871135987787813, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9434572458267212, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8647003769874573, + "num_tokens": 560976805.0, + "step": 14709 + }, + { + "epoch": 1.8712631980664036, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7984651327133179, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8604155778884888, + "num_tokens": 561018940.0, + "step": 14710 + }, + { + "epoch": 1.8713904083449941, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.850687861442566, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8684505224227905, + "num_tokens": 561065228.0, + "step": 14711 + }, + { + "epoch": 1.8715176186235847, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8856596946716309, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8563681840896606, + "num_tokens": 561107540.0, + "step": 14712 + }, + { + "epoch": 1.8716448289021752, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2744288444519043, + "learning_rate": 1e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8440371751785278, + "num_tokens": 561144774.0, + "step": 14713 + }, + { + "epoch": 1.8717720391807657, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8716282844543457, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8454049825668335, + "num_tokens": 561186774.0, + "step": 14714 + }, + { + "epoch": 1.8718992494593563, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.04866099357605, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8770309686660767, + "num_tokens": 561223997.0, + "step": 14715 + }, + { + "epoch": 1.8720264597379468, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8934539556503296, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.871620774269104, + "num_tokens": 561261583.0, + "step": 14716 + }, + { + "epoch": 1.8721536700165373, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.863634705543518, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8692489862442017, + "num_tokens": 561302681.0, + "step": 14717 + }, + { + "epoch": 1.8722808802951278, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0027174949645996, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8621805906295776, + "num_tokens": 561338402.0, + "step": 14718 + }, + { + "epoch": 1.8724080905737184, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3476953506469727, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8695306777954102, + "num_tokens": 561372892.0, + "step": 14719 + }, + { + "epoch": 1.872535300852309, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.953086256980896, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8613951802253723, + "num_tokens": 561409474.0, + "step": 14720 + }, + { + "epoch": 1.8726625111308994, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9550434350967407, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8379988670349121, + "num_tokens": 561450152.0, + "step": 14721 + }, + { + "epoch": 1.87278972140949, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9176090955734253, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8581123948097229, + "num_tokens": 561489448.0, + "step": 14722 + }, + { + "epoch": 1.8729169316880805, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9176100492477417, + "learning_rate": 1e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.8408071994781494, + "num_tokens": 561529684.0, + "step": 14723 + }, + { + "epoch": 1.873044141966671, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0025882720947266, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8662035465240479, + "num_tokens": 561564259.0, + "step": 14724 + }, + { + "epoch": 1.8731713522452615, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.021153450012207, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8713120818138123, + "num_tokens": 561609650.0, + "step": 14725 + }, + { + "epoch": 1.873298562523852, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8924065828323364, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.870445191860199, + "num_tokens": 561645520.0, + "step": 14726 + }, + { + "epoch": 1.8734257728024426, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8174550533294678, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8700124025344849, + "num_tokens": 561682771.0, + "step": 14727 + }, + { + "epoch": 1.873552983081033, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.889148473739624, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8547691106796265, + "num_tokens": 561721317.0, + "step": 14728 + }, + { + "epoch": 1.8736801933596234, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0098416805267334, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8660554885864258, + "num_tokens": 561763178.0, + "step": 14729 + }, + { + "epoch": 1.873807403638214, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8547508716583252, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8657454252243042, + "num_tokens": 561802292.0, + "step": 14730 + }, + { + "epoch": 1.8739346139168045, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.824913740158081, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8740737438201904, + "num_tokens": 561838610.0, + "step": 14731 + }, + { + "epoch": 1.874061824195395, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9037463665008545, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8678812980651855, + "num_tokens": 561881992.0, + "step": 14732 + }, + { + "epoch": 1.8741890344739855, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.076444149017334, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8545302152633667, + "num_tokens": 561917661.0, + "step": 14733 + }, + { + "epoch": 1.8743162447525759, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.812812328338623, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.865694522857666, + "num_tokens": 561958218.0, + "step": 14734 + }, + { + "epoch": 1.8744434550311664, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8683258295059204, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8757843971252441, + "num_tokens": 561993882.0, + "step": 14735 + }, + { + "epoch": 1.874570665309757, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7449313402175903, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8656122088432312, + "num_tokens": 562035443.0, + "step": 14736 + }, + { + "epoch": 1.8746978755883474, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.892892599105835, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8624568581581116, + "num_tokens": 562072806.0, + "step": 14737 + }, + { + "epoch": 1.874825085866938, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8726388216018677, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8468077182769775, + "num_tokens": 562113652.0, + "step": 14738 + }, + { + "epoch": 1.8749522961455285, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.886825442314148, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8553104400634766, + "num_tokens": 562150600.0, + "step": 14739 + }, + { + "epoch": 1.875079506424119, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0355384349823, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8660562038421631, + "num_tokens": 562187730.0, + "step": 14740 + }, + { + "epoch": 1.8752067167027096, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9045833349227905, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.865017831325531, + "num_tokens": 562228229.0, + "step": 14741 + }, + { + "epoch": 1.8753339269813, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.025925397872925, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8706615567207336, + "num_tokens": 562262293.0, + "step": 14742 + }, + { + "epoch": 1.8754611372598906, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.84529709815979, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8623440265655518, + "num_tokens": 562303389.0, + "step": 14743 + }, + { + "epoch": 1.8755883475384811, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8875764608383179, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8576527237892151, + "num_tokens": 562344461.0, + "step": 14744 + }, + { + "epoch": 1.8757155578170717, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8655529022216797, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8476362824440002, + "num_tokens": 562387068.0, + "step": 14745 + }, + { + "epoch": 1.8758427680956622, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8015077114105225, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8623003959655762, + "num_tokens": 562428225.0, + "step": 14746 + }, + { + "epoch": 1.8759699783742527, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9297475814819336, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8644381165504456, + "num_tokens": 562463879.0, + "step": 14747 + }, + { + "epoch": 1.8760971886528433, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0812013149261475, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8570364713668823, + "num_tokens": 562502327.0, + "step": 14748 + }, + { + "epoch": 1.8762243989314338, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.01347279548645, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.867800772190094, + "num_tokens": 562542162.0, + "step": 14749 + }, + { + "epoch": 1.8763516092100243, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3004531860351562, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8585741519927979, + "num_tokens": 562583364.0, + "step": 14750 + }, + { + "epoch": 1.8764788194886148, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9953655004501343, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8663937449455261, + "num_tokens": 562615870.0, + "step": 14751 + }, + { + "epoch": 1.8766060297672051, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9604541063308716, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8488359451293945, + "num_tokens": 562655086.0, + "step": 14752 + }, + { + "epoch": 1.8767332400457957, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7772387266159058, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8570652008056641, + "num_tokens": 562697532.0, + "step": 14753 + }, + { + "epoch": 1.8768604503243862, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.080122947692871, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8436518311500549, + "num_tokens": 562731057.0, + "step": 14754 + }, + { + "epoch": 1.8769876606029767, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8524290323257446, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8521115183830261, + "num_tokens": 562770402.0, + "step": 14755 + }, + { + "epoch": 1.8771148708815673, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9440336227416992, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8594372272491455, + "num_tokens": 562806191.0, + "step": 14756 + }, + { + "epoch": 1.8772420811601578, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8959020376205444, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8631086349487305, + "num_tokens": 562846864.0, + "step": 14757 + }, + { + "epoch": 1.877369291438748, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9144474267959595, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8578394651412964, + "num_tokens": 562882887.0, + "step": 14758 + }, + { + "epoch": 1.8774965017173386, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.809116244316101, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.862962007522583, + "num_tokens": 562919096.0, + "step": 14759 + }, + { + "epoch": 1.8776237119959291, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0073623657226562, + "learning_rate": 1e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8378198146820068, + "num_tokens": 562959336.0, + "step": 14760 + }, + { + "epoch": 1.8777509222745197, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8440192937850952, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8637412190437317, + "num_tokens": 562997883.0, + "step": 14761 + }, + { + "epoch": 1.8778781325531102, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7642217874526978, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8742905259132385, + "num_tokens": 563036763.0, + "step": 14762 + }, + { + "epoch": 1.8780053428317007, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8813260793685913, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8801822662353516, + "num_tokens": 563076815.0, + "step": 14763 + }, + { + "epoch": 1.8781325531102913, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9403008222579956, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8745607137680054, + "num_tokens": 563110511.0, + "step": 14764 + }, + { + "epoch": 1.8782597633888818, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9633219242095947, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8589726090431213, + "num_tokens": 563140757.0, + "step": 14765 + }, + { + "epoch": 1.8783869736674723, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7770402431488037, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8633072376251221, + "num_tokens": 563179961.0, + "step": 14766 + }, + { + "epoch": 1.8785141839460628, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.769505500793457, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8697018027305603, + "num_tokens": 563221355.0, + "step": 14767 + }, + { + "epoch": 1.8786413942246534, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7749249935150146, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8855173587799072, + "num_tokens": 563257427.0, + "step": 14768 + }, + { + "epoch": 1.878768604503244, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.899138331413269, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8614387512207031, + "num_tokens": 563294371.0, + "step": 14769 + }, + { + "epoch": 1.8788958147818344, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.817047119140625, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8737057447433472, + "num_tokens": 563338592.0, + "step": 14770 + }, + { + "epoch": 1.879023025060425, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.963998556137085, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8676580190658569, + "num_tokens": 563373853.0, + "step": 14771 + }, + { + "epoch": 1.8791502353390155, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.929187297821045, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8629043102264404, + "num_tokens": 563411137.0, + "step": 14772 + }, + { + "epoch": 1.879277445617606, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8084068298339844, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8690853118896484, + "num_tokens": 563450294.0, + "step": 14773 + }, + { + "epoch": 1.8794046558961965, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7606192827224731, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8654868602752686, + "num_tokens": 563496858.0, + "step": 14774 + }, + { + "epoch": 1.879531866174787, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.838820457458496, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8599400520324707, + "num_tokens": 563539566.0, + "step": 14775 + }, + { + "epoch": 1.8796590764533776, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.219510555267334, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8710086345672607, + "num_tokens": 563574363.0, + "step": 14776 + }, + { + "epoch": 1.879786286731968, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0868630409240723, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8653019666671753, + "num_tokens": 563605540.0, + "step": 14777 + }, + { + "epoch": 1.8799134970105584, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.443589687347412, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8497797250747681, + "num_tokens": 563646640.0, + "step": 14778 + }, + { + "epoch": 1.880040707289149, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8190622329711914, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8749906420707703, + "num_tokens": 563683576.0, + "step": 14779 + }, + { + "epoch": 1.8801679175677395, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8670449256896973, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8574509620666504, + "num_tokens": 563724088.0, + "step": 14780 + }, + { + "epoch": 1.88029512784633, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8786892890930176, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8631964921951294, + "num_tokens": 563760897.0, + "step": 14781 + }, + { + "epoch": 1.8804223381249205, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8120304346084595, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.868228554725647, + "num_tokens": 563800158.0, + "step": 14782 + }, + { + "epoch": 1.8805495484035109, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8617291450500488, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8714524507522583, + "num_tokens": 563833288.0, + "step": 14783 + }, + { + "epoch": 1.8806767586821014, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.730216145515442, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8753604292869568, + "num_tokens": 563874507.0, + "step": 14784 + }, + { + "epoch": 1.880803968960692, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8457560539245605, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.864840030670166, + "num_tokens": 563912452.0, + "step": 14785 + }, + { + "epoch": 1.8809311792392824, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9266941547393799, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.846973180770874, + "num_tokens": 563950565.0, + "step": 14786 + }, + { + "epoch": 1.881058389517873, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9864970445632935, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8551161885261536, + "num_tokens": 563984924.0, + "step": 14787 + }, + { + "epoch": 1.8811855997964635, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7808810472488403, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8719816207885742, + "num_tokens": 564022887.0, + "step": 14788 + }, + { + "epoch": 1.881312810075054, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.747876763343811, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8632776737213135, + "num_tokens": 564064829.0, + "step": 14789 + }, + { + "epoch": 1.8814400203536445, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0744755268096924, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8427888751029968, + "num_tokens": 564100858.0, + "step": 14790 + }, + { + "epoch": 1.881567230632235, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.892586588859558, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8568787574768066, + "num_tokens": 564141477.0, + "step": 14791 + }, + { + "epoch": 1.8816944409108256, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8641477823257446, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8572877645492554, + "num_tokens": 564179271.0, + "step": 14792 + }, + { + "epoch": 1.8818216511894161, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.943035364151001, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8702907562255859, + "num_tokens": 564224336.0, + "step": 14793 + }, + { + "epoch": 1.8819488614680067, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8380581140518188, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8591183423995972, + "num_tokens": 564265157.0, + "step": 14794 + }, + { + "epoch": 1.8820760717465972, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.003638744354248, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8693639039993286, + "num_tokens": 564295023.0, + "step": 14795 + }, + { + "epoch": 1.8822032820251877, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7973345518112183, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8660799264907837, + "num_tokens": 564333700.0, + "step": 14796 + }, + { + "epoch": 1.8823304923037782, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9592456817626953, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8499454259872437, + "num_tokens": 564374342.0, + "step": 14797 + }, + { + "epoch": 1.8824577025823688, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.6453166007995605, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8654885292053223, + "num_tokens": 564422877.0, + "step": 14798 + }, + { + "epoch": 1.8825849128609593, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8238331079483032, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8645803332328796, + "num_tokens": 564464713.0, + "step": 14799 + }, + { + "epoch": 1.8827121231395498, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0666866302490234, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8537436127662659, + "num_tokens": 564500667.0, + "step": 14800 + }, + { + "epoch": 1.8828393334181401, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7818629741668701, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8729139566421509, + "num_tokens": 564541460.0, + "step": 14801 + }, + { + "epoch": 1.8829665436967307, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8166029453277588, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8548702001571655, + "num_tokens": 564580983.0, + "step": 14802 + }, + { + "epoch": 1.8830937539753212, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8576087951660156, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8733178377151489, + "num_tokens": 564615967.0, + "step": 14803 + }, + { + "epoch": 1.8832209642539117, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8596585988998413, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8553717136383057, + "num_tokens": 564653255.0, + "step": 14804 + }, + { + "epoch": 1.8833481745325023, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.017493486404419, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8599072694778442, + "num_tokens": 564688420.0, + "step": 14805 + }, + { + "epoch": 1.8834753848110928, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6929389238357544, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8566951751708984, + "num_tokens": 564733123.0, + "step": 14806 + }, + { + "epoch": 1.883602595089683, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.783225417137146, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.872043251991272, + "num_tokens": 564767712.0, + "step": 14807 + }, + { + "epoch": 1.8837298053682736, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.968629002571106, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8610872626304626, + "num_tokens": 564810663.0, + "step": 14808 + }, + { + "epoch": 1.8838570156468641, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.941440463066101, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.861205518245697, + "num_tokens": 564848689.0, + "step": 14809 + }, + { + "epoch": 1.8839842259254547, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8078705072402954, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8507229685783386, + "num_tokens": 564887426.0, + "step": 14810 + }, + { + "epoch": 1.8841114362040452, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9335259199142456, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8673186302185059, + "num_tokens": 564924852.0, + "step": 14811 + }, + { + "epoch": 1.8842386464826357, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8160779476165771, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8592361807823181, + "num_tokens": 564964201.0, + "step": 14812 + }, + { + "epoch": 1.8843658567612263, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.754294991493225, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8755631446838379, + "num_tokens": 565004939.0, + "step": 14813 + }, + { + "epoch": 1.8844930670398168, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1317620277404785, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8679932951927185, + "num_tokens": 565044067.0, + "step": 14814 + }, + { + "epoch": 1.8846202773184073, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8502411842346191, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8663139343261719, + "num_tokens": 565082114.0, + "step": 14815 + }, + { + "epoch": 1.8847474875969978, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9215816259384155, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8639201521873474, + "num_tokens": 565118350.0, + "step": 14816 + }, + { + "epoch": 1.8848746978755884, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.909601092338562, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8562933206558228, + "num_tokens": 565153313.0, + "step": 14817 + }, + { + "epoch": 1.885001908154179, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8129104375839233, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8528473377227783, + "num_tokens": 565191332.0, + "step": 14818 + }, + { + "epoch": 1.8851291184327694, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8147708177566528, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8597512245178223, + "num_tokens": 565229937.0, + "step": 14819 + }, + { + "epoch": 1.88525632871136, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8320268392562866, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8582923412322998, + "num_tokens": 565266271.0, + "step": 14820 + }, + { + "epoch": 1.8853835389899505, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0932464599609375, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8614563941955566, + "num_tokens": 565300593.0, + "step": 14821 + }, + { + "epoch": 1.885510749268541, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8183990716934204, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8631685972213745, + "num_tokens": 565341801.0, + "step": 14822 + }, + { + "epoch": 1.8856379595471315, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9399195909500122, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8813141584396362, + "num_tokens": 565374380.0, + "step": 14823 + }, + { + "epoch": 1.885765169825722, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9457030296325684, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8564981818199158, + "num_tokens": 565413096.0, + "step": 14824 + }, + { + "epoch": 1.8858923801043126, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8982266187667847, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8735431432723999, + "num_tokens": 565453292.0, + "step": 14825 + }, + { + "epoch": 1.886019590382903, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.009161949157715, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8573684692382812, + "num_tokens": 565489311.0, + "step": 14826 + }, + { + "epoch": 1.8861468006614934, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9551712274551392, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8469754457473755, + "num_tokens": 565526293.0, + "step": 14827 + }, + { + "epoch": 1.886274010940084, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8279249668121338, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8559128642082214, + "num_tokens": 565569215.0, + "step": 14828 + }, + { + "epoch": 1.8864012212186745, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8214683532714844, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8729891180992126, + "num_tokens": 565610755.0, + "step": 14829 + }, + { + "epoch": 1.886528431497265, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0394883155822754, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8687577247619629, + "num_tokens": 565641964.0, + "step": 14830 + }, + { + "epoch": 1.8866556417758555, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0703327655792236, + "learning_rate": 1e-06, + "loss": 0.541, + "mean_token_accuracy": 0.8296238780021667, + "num_tokens": 565682852.0, + "step": 14831 + }, + { + "epoch": 1.8867828520544458, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.119260549545288, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.879143476486206, + "num_tokens": 565721352.0, + "step": 14832 + }, + { + "epoch": 1.8869100623330364, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0130820274353027, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8686491250991821, + "num_tokens": 565764045.0, + "step": 14833 + }, + { + "epoch": 1.887037272611627, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9676402807235718, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.872330904006958, + "num_tokens": 565797617.0, + "step": 14834 + }, + { + "epoch": 1.8871644828902174, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 9.563138008117676, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8624197840690613, + "num_tokens": 565839254.0, + "step": 14835 + }, + { + "epoch": 1.887291693168808, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2434446811676025, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8640468120574951, + "num_tokens": 565874742.0, + "step": 14836 + }, + { + "epoch": 1.8874189034473985, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1037468910217285, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8631604909896851, + "num_tokens": 565911043.0, + "step": 14837 + }, + { + "epoch": 1.887546113725989, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.07737398147583, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8583507537841797, + "num_tokens": 565950228.0, + "step": 14838 + }, + { + "epoch": 1.8876733240045795, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.778590202331543, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8757138252258301, + "num_tokens": 565982891.0, + "step": 14839 + }, + { + "epoch": 1.88780053428317, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8173748254776, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8614076375961304, + "num_tokens": 566022601.0, + "step": 14840 + }, + { + "epoch": 1.8879277445617606, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0183568000793457, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8653842806816101, + "num_tokens": 566054928.0, + "step": 14841 + }, + { + "epoch": 1.8880549548403511, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 16.60429573059082, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8573580980300903, + "num_tokens": 566094900.0, + "step": 14842 + }, + { + "epoch": 1.8881821651189417, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.4532275199890137, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.861839771270752, + "num_tokens": 566125230.0, + "step": 14843 + }, + { + "epoch": 1.8883093753975322, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.014364242553711, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8782050609588623, + "num_tokens": 566165320.0, + "step": 14844 + }, + { + "epoch": 1.8884365856761227, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.095031499862671, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8531637787818909, + "num_tokens": 566202769.0, + "step": 14845 + }, + { + "epoch": 1.8885637959547132, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0011210441589355, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8662053346633911, + "num_tokens": 566245874.0, + "step": 14846 + }, + { + "epoch": 1.8886910062333038, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7611943483352661, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8454945087432861, + "num_tokens": 566284855.0, + "step": 14847 + }, + { + "epoch": 1.8888182165118943, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0326027870178223, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8551965355873108, + "num_tokens": 566323422.0, + "step": 14848 + }, + { + "epoch": 1.8889454267904848, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9318547248840332, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8723502159118652, + "num_tokens": 566364357.0, + "step": 14849 + }, + { + "epoch": 1.8890726370690751, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0075089931488037, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8607247471809387, + "num_tokens": 566404498.0, + "step": 14850 + }, + { + "epoch": 1.8891998473476657, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9227906465530396, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8595713376998901, + "num_tokens": 566442960.0, + "step": 14851 + }, + { + "epoch": 1.8893270576262562, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0095150470733643, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8737510442733765, + "num_tokens": 566475087.0, + "step": 14852 + }, + { + "epoch": 1.8894542679048467, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7921651601791382, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8538426756858826, + "num_tokens": 566514337.0, + "step": 14853 + }, + { + "epoch": 1.8895814781834372, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8902549743652344, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8519642353057861, + "num_tokens": 566553204.0, + "step": 14854 + }, + { + "epoch": 1.8897086884620278, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8526160717010498, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8612745404243469, + "num_tokens": 566589697.0, + "step": 14855 + }, + { + "epoch": 1.889835898740618, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8591172695159912, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.850017786026001, + "num_tokens": 566629680.0, + "step": 14856 + }, + { + "epoch": 1.8899631090192086, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9591647386550903, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8860369324684143, + "num_tokens": 566670931.0, + "step": 14857 + }, + { + "epoch": 1.8900903192977991, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8781012296676636, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.856451153755188, + "num_tokens": 566706673.0, + "step": 14858 + }, + { + "epoch": 1.8902175295763897, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7480924129486084, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8598119616508484, + "num_tokens": 566749503.0, + "step": 14859 + }, + { + "epoch": 1.8903447398549802, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6991586685180664, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.877609372138977, + "num_tokens": 566790700.0, + "step": 14860 + }, + { + "epoch": 1.8904719501335707, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8153201341629028, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8636444807052612, + "num_tokens": 566832995.0, + "step": 14861 + }, + { + "epoch": 1.8905991604121613, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.010862112045288, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8515455722808838, + "num_tokens": 566868190.0, + "step": 14862 + }, + { + "epoch": 1.8907263706907518, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0499043464660645, + "learning_rate": 1e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8342384696006775, + "num_tokens": 566906637.0, + "step": 14863 + }, + { + "epoch": 1.8908535809693423, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8518949747085571, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8639650940895081, + "num_tokens": 566946006.0, + "step": 14864 + }, + { + "epoch": 1.8909807912479328, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8899519443511963, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8517654538154602, + "num_tokens": 566987726.0, + "step": 14865 + }, + { + "epoch": 1.8911080015265234, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8542299270629883, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8787106275558472, + "num_tokens": 567027432.0, + "step": 14866 + }, + { + "epoch": 1.891235211805114, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1200385093688965, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8694615960121155, + "num_tokens": 567060607.0, + "step": 14867 + }, + { + "epoch": 1.8913624220837044, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8628453016281128, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8567530512809753, + "num_tokens": 567100691.0, + "step": 14868 + }, + { + "epoch": 1.891489632362295, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6522136926651, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.881711483001709, + "num_tokens": 567139460.0, + "step": 14869 + }, + { + "epoch": 1.8916168426408855, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7366385459899902, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8523038625717163, + "num_tokens": 567185384.0, + "step": 14870 + }, + { + "epoch": 1.891744052919476, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9607867002487183, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8610612154006958, + "num_tokens": 567219898.0, + "step": 14871 + }, + { + "epoch": 1.8918712631980665, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7911745309829712, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8702039122581482, + "num_tokens": 567260322.0, + "step": 14872 + }, + { + "epoch": 1.891998473476657, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9732158184051514, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8585183620452881, + "num_tokens": 567293739.0, + "step": 14873 + }, + { + "epoch": 1.8921256837552476, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.960022211074829, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.865209698677063, + "num_tokens": 567327282.0, + "step": 14874 + }, + { + "epoch": 1.892252894033838, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.045311212539673, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8569306135177612, + "num_tokens": 567366402.0, + "step": 14875 + }, + { + "epoch": 1.8923801043124284, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7969428300857544, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8555364608764648, + "num_tokens": 567409924.0, + "step": 14876 + }, + { + "epoch": 1.892507314591019, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9970053434371948, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8676415085792542, + "num_tokens": 567450355.0, + "step": 14877 + }, + { + "epoch": 1.8926345248696095, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3583381175994873, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8765276670455933, + "num_tokens": 567487701.0, + "step": 14878 + }, + { + "epoch": 1.8927617351482, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2452902793884277, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8639299273490906, + "num_tokens": 567527619.0, + "step": 14879 + }, + { + "epoch": 1.8928889454267905, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8873151540756226, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8635253310203552, + "num_tokens": 567567016.0, + "step": 14880 + }, + { + "epoch": 1.8930161557053808, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8315268754959106, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8509312272071838, + "num_tokens": 567605465.0, + "step": 14881 + }, + { + "epoch": 1.8931433659839714, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6644833087921143, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8649213910102844, + "num_tokens": 567646680.0, + "step": 14882 + }, + { + "epoch": 1.893270576262562, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2228779792785645, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8641890287399292, + "num_tokens": 567685685.0, + "step": 14883 + }, + { + "epoch": 1.8933977865411524, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9745218753814697, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8518329858779907, + "num_tokens": 567717222.0, + "step": 14884 + }, + { + "epoch": 1.893524996819743, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8369678258895874, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8539555668830872, + "num_tokens": 567761050.0, + "step": 14885 + }, + { + "epoch": 1.8936522070983335, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9460701942443848, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8604168891906738, + "num_tokens": 567801423.0, + "step": 14886 + }, + { + "epoch": 1.893779417376924, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.003077507019043, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8536887168884277, + "num_tokens": 567844614.0, + "step": 14887 + }, + { + "epoch": 1.8939066276555145, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9655171632766724, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8658559322357178, + "num_tokens": 567878803.0, + "step": 14888 + }, + { + "epoch": 1.894033837934105, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.4927568435668945, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8756922483444214, + "num_tokens": 567909419.0, + "step": 14889 + }, + { + "epoch": 1.8941610482126956, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.910536527633667, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8640881776809692, + "num_tokens": 567949221.0, + "step": 14890 + }, + { + "epoch": 1.8942882584912861, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8565936088562012, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8797525763511658, + "num_tokens": 567985101.0, + "step": 14891 + }, + { + "epoch": 1.8944154687698767, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8052040338516235, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8476583361625671, + "num_tokens": 568030542.0, + "step": 14892 + }, + { + "epoch": 1.8945426790484672, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.77329683303833, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8648926019668579, + "num_tokens": 568070723.0, + "step": 14893 + }, + { + "epoch": 1.8946698893270577, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8336272239685059, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8570901155471802, + "num_tokens": 568113444.0, + "step": 14894 + }, + { + "epoch": 1.8947970996056482, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8024694919586182, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8659448027610779, + "num_tokens": 568152646.0, + "step": 14895 + }, + { + "epoch": 1.8949243098842388, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9058419466018677, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8633471727371216, + "num_tokens": 568187046.0, + "step": 14896 + }, + { + "epoch": 1.8950515201628293, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8791122436523438, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8580650091171265, + "num_tokens": 568225052.0, + "step": 14897 + }, + { + "epoch": 1.8951787304414198, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.005408763885498, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8551608324050903, + "num_tokens": 568258299.0, + "step": 14898 + }, + { + "epoch": 1.8953059407200101, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.285273790359497, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8703010082244873, + "num_tokens": 568297137.0, + "step": 14899 + }, + { + "epoch": 1.8954331509986007, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.024759531021118, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8842660784721375, + "num_tokens": 568331102.0, + "step": 14900 + }, + { + "epoch": 1.8955603612771912, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.045525550842285, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8602953553199768, + "num_tokens": 568373926.0, + "step": 14901 + }, + { + "epoch": 1.8956875715557817, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.185122013092041, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8729443550109863, + "num_tokens": 568406470.0, + "step": 14902 + }, + { + "epoch": 1.8958147818343722, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.893392562866211, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8702927827835083, + "num_tokens": 568443452.0, + "step": 14903 + }, + { + "epoch": 1.8959419921129628, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.927903652191162, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8575201034545898, + "num_tokens": 568480085.0, + "step": 14904 + }, + { + "epoch": 1.896069202391553, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9168204069137573, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8603410124778748, + "num_tokens": 568519191.0, + "step": 14905 + }, + { + "epoch": 1.8961964126701436, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9353569746017456, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8694285154342651, + "num_tokens": 568554224.0, + "step": 14906 + }, + { + "epoch": 1.8963236229487341, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8644208908081055, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.880013644695282, + "num_tokens": 568586102.0, + "step": 14907 + }, + { + "epoch": 1.8964508332273247, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8354692459106445, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8585226535797119, + "num_tokens": 568625224.0, + "step": 14908 + }, + { + "epoch": 1.8965780435059152, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9440393447875977, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8431928753852844, + "num_tokens": 568665588.0, + "step": 14909 + }, + { + "epoch": 1.8967052537845057, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0621423721313477, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8797112107276917, + "num_tokens": 568698408.0, + "step": 14910 + }, + { + "epoch": 1.8968324640630962, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0256154537200928, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8509202599525452, + "num_tokens": 568731982.0, + "step": 14911 + }, + { + "epoch": 1.8969596743416868, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8393511772155762, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8634775876998901, + "num_tokens": 568769388.0, + "step": 14912 + }, + { + "epoch": 1.8970868846202773, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8851345777511597, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8699442148208618, + "num_tokens": 568810329.0, + "step": 14913 + }, + { + "epoch": 1.8972140948988678, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.814405083656311, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8673548102378845, + "num_tokens": 568852241.0, + "step": 14914 + }, + { + "epoch": 1.8973413051774584, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9475433826446533, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8590879440307617, + "num_tokens": 568889660.0, + "step": 14915 + }, + { + "epoch": 1.8974685154560489, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7953215837478638, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8601329326629639, + "num_tokens": 568934489.0, + "step": 14916 + }, + { + "epoch": 1.8975957257346394, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7477445602416992, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8847779035568237, + "num_tokens": 568971904.0, + "step": 14917 + }, + { + "epoch": 1.89772293601323, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1043384075164795, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8448178172111511, + "num_tokens": 569011863.0, + "step": 14918 + }, + { + "epoch": 1.8978501462918205, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.779667615890503, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8846241235733032, + "num_tokens": 569050295.0, + "step": 14919 + }, + { + "epoch": 1.897977356570411, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9731711149215698, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8577414155006409, + "num_tokens": 569087013.0, + "step": 14920 + }, + { + "epoch": 1.8981045668490015, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9588969945907593, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8569807410240173, + "num_tokens": 569125226.0, + "step": 14921 + }, + { + "epoch": 1.898231777127592, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9760280847549438, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8598708510398865, + "num_tokens": 569161910.0, + "step": 14922 + }, + { + "epoch": 1.8983589874061826, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.006577491760254, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8657963871955872, + "num_tokens": 569196519.0, + "step": 14923 + }, + { + "epoch": 1.898486197684773, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8914607763290405, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8601586818695068, + "num_tokens": 569234404.0, + "step": 14924 + }, + { + "epoch": 1.8986134079633634, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9036375284194946, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8560584187507629, + "num_tokens": 569272728.0, + "step": 14925 + }, + { + "epoch": 1.898740618241954, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0433051586151123, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8549827337265015, + "num_tokens": 569315434.0, + "step": 14926 + }, + { + "epoch": 1.8988678285205445, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.973685383796692, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8471648097038269, + "num_tokens": 569357473.0, + "step": 14927 + }, + { + "epoch": 1.898995038799135, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.808980107307434, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8516294956207275, + "num_tokens": 569398747.0, + "step": 14928 + }, + { + "epoch": 1.8991222490777255, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7960346937179565, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.870929479598999, + "num_tokens": 569434855.0, + "step": 14929 + }, + { + "epoch": 1.8992494593563158, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.083766222000122, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8631173372268677, + "num_tokens": 569475236.0, + "step": 14930 + }, + { + "epoch": 1.8993766696349064, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9260163307189941, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.862844705581665, + "num_tokens": 569513599.0, + "step": 14931 + }, + { + "epoch": 1.899503879913497, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.809478998184204, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8714685440063477, + "num_tokens": 569559059.0, + "step": 14932 + }, + { + "epoch": 1.8996310901920874, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7802553176879883, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.881589412689209, + "num_tokens": 569595240.0, + "step": 14933 + }, + { + "epoch": 1.899758300470678, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9581588506698608, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8531243801116943, + "num_tokens": 569629336.0, + "step": 14934 + }, + { + "epoch": 1.8998855107492685, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.842127799987793, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.87542325258255, + "num_tokens": 569663613.0, + "step": 14935 + }, + { + "epoch": 1.900012721027859, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.859336495399475, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8476095199584961, + "num_tokens": 569702884.0, + "step": 14936 + }, + { + "epoch": 1.9001399313064495, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6718374490737915, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8785487413406372, + "num_tokens": 569744618.0, + "step": 14937 + }, + { + "epoch": 1.90026714158504, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9350618124008179, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.87224280834198, + "num_tokens": 569777083.0, + "step": 14938 + }, + { + "epoch": 1.9003943518636306, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9205936193466187, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8702678084373474, + "num_tokens": 569809652.0, + "step": 14939 + }, + { + "epoch": 1.9005215621422211, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3988699913024902, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8686730861663818, + "num_tokens": 569846465.0, + "step": 14940 + }, + { + "epoch": 1.9006487724208116, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9432518482208252, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8558934926986694, + "num_tokens": 569883655.0, + "step": 14941 + }, + { + "epoch": 1.9007759826994022, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9415818452835083, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8630111813545227, + "num_tokens": 569923358.0, + "step": 14942 + }, + { + "epoch": 1.9009031929779927, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8689844608306885, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8603726029396057, + "num_tokens": 569961393.0, + "step": 14943 + }, + { + "epoch": 1.9010304032565832, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.914658784866333, + "learning_rate": 1e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8360703587532043, + "num_tokens": 570002092.0, + "step": 14944 + }, + { + "epoch": 1.9011576135351738, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9848443269729614, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8474656939506531, + "num_tokens": 570037699.0, + "step": 14945 + }, + { + "epoch": 1.9012848238137643, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8929367065429688, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8393394947052002, + "num_tokens": 570077693.0, + "step": 14946 + }, + { + "epoch": 1.9014120340923548, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7818615436553955, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8826961517333984, + "num_tokens": 570115354.0, + "step": 14947 + }, + { + "epoch": 1.9015392443709451, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7986958026885986, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8708338737487793, + "num_tokens": 570154387.0, + "step": 14948 + }, + { + "epoch": 1.9016664546495357, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9754586219787598, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8577110767364502, + "num_tokens": 570189743.0, + "step": 14949 + }, + { + "epoch": 1.9017936649281262, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9446970224380493, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8753383755683899, + "num_tokens": 570224009.0, + "step": 14950 + }, + { + "epoch": 1.9019208752067167, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6886802911758423, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8706700801849365, + "num_tokens": 570266703.0, + "step": 14951 + }, + { + "epoch": 1.9020480854853072, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0038535594940186, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8631608486175537, + "num_tokens": 570308241.0, + "step": 14952 + }, + { + "epoch": 1.9021752957638978, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0085606575012207, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8629860281944275, + "num_tokens": 570346931.0, + "step": 14953 + }, + { + "epoch": 1.902302506042488, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8744523525238037, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8774243593215942, + "num_tokens": 570380373.0, + "step": 14954 + }, + { + "epoch": 1.9024297163210786, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9797816276550293, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8758281469345093, + "num_tokens": 570420106.0, + "step": 14955 + }, + { + "epoch": 1.9025569265996691, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.657732605934143, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8846566677093506, + "num_tokens": 570464534.0, + "step": 14956 + }, + { + "epoch": 1.9026841368782597, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7932727336883545, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8535096645355225, + "num_tokens": 570505734.0, + "step": 14957 + }, + { + "epoch": 1.9028113471568502, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.707745909690857, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8730740547180176, + "num_tokens": 570545913.0, + "step": 14958 + }, + { + "epoch": 1.9029385574354407, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7312794923782349, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8718403577804565, + "num_tokens": 570584397.0, + "step": 14959 + }, + { + "epoch": 1.9030657677140312, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7245383262634277, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.870668888092041, + "num_tokens": 570622285.0, + "step": 14960 + }, + { + "epoch": 1.9031929779926218, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9426556825637817, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8486677408218384, + "num_tokens": 570656331.0, + "step": 14961 + }, + { + "epoch": 1.9033201882712123, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9064959287643433, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8692313432693481, + "num_tokens": 570699944.0, + "step": 14962 + }, + { + "epoch": 1.9034473985498028, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9271100759506226, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8583565354347229, + "num_tokens": 570737649.0, + "step": 14963 + }, + { + "epoch": 1.9035746088283934, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8512521982192993, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8632662296295166, + "num_tokens": 570773193.0, + "step": 14964 + }, + { + "epoch": 1.9037018191069839, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.268796443939209, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8586567044258118, + "num_tokens": 570809117.0, + "step": 14965 + }, + { + "epoch": 1.9038290293855744, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.130859851837158, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.875913143157959, + "num_tokens": 570844268.0, + "step": 14966 + }, + { + "epoch": 1.903956239664165, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1588945388793945, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8554891347885132, + "num_tokens": 570890166.0, + "step": 14967 + }, + { + "epoch": 1.9040834499427555, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1090142726898193, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8575330972671509, + "num_tokens": 570921419.0, + "step": 14968 + }, + { + "epoch": 1.904210660221346, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7876075506210327, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.855836033821106, + "num_tokens": 570966444.0, + "step": 14969 + }, + { + "epoch": 1.9043378704999365, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7141505479812622, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8740021586418152, + "num_tokens": 571011656.0, + "step": 14970 + }, + { + "epoch": 1.904465080778527, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0529489517211914, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8694199323654175, + "num_tokens": 571047828.0, + "step": 14971 + }, + { + "epoch": 1.9045922910571176, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.874354362487793, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8611922264099121, + "num_tokens": 571084541.0, + "step": 14972 + }, + { + "epoch": 1.9047195013357079, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9261231422424316, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.858662486076355, + "num_tokens": 571120978.0, + "step": 14973 + }, + { + "epoch": 1.9048467116142984, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7709277868270874, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8761458396911621, + "num_tokens": 571160181.0, + "step": 14974 + }, + { + "epoch": 1.904973921892889, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9160122871398926, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8688445091247559, + "num_tokens": 571194648.0, + "step": 14975 + }, + { + "epoch": 1.9051011321714795, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7835692167282104, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8575941920280457, + "num_tokens": 571238449.0, + "step": 14976 + }, + { + "epoch": 1.90522834245007, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.042078971862793, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8645734786987305, + "num_tokens": 571277266.0, + "step": 14977 + }, + { + "epoch": 1.9053555527286605, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.926169753074646, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8752729892730713, + "num_tokens": 571316891.0, + "step": 14978 + }, + { + "epoch": 1.9054827630072508, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3893072605133057, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8645765781402588, + "num_tokens": 571353097.0, + "step": 14979 + }, + { + "epoch": 1.9056099732858414, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.86772882938385, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.878505289554596, + "num_tokens": 571392217.0, + "step": 14980 + }, + { + "epoch": 1.905737183564432, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7931610345840454, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8731387257575989, + "num_tokens": 571430440.0, + "step": 14981 + }, + { + "epoch": 1.9058643938430224, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9092998504638672, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8600397109985352, + "num_tokens": 571472002.0, + "step": 14982 + }, + { + "epoch": 1.905991604121613, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0644290447235107, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8453481793403625, + "num_tokens": 571509368.0, + "step": 14983 + }, + { + "epoch": 1.9061188144002035, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.031633138656616, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8491292595863342, + "num_tokens": 571543898.0, + "step": 14984 + }, + { + "epoch": 1.906246024678794, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8849248886108398, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8580029010772705, + "num_tokens": 571584395.0, + "step": 14985 + }, + { + "epoch": 1.9063732349573845, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8839592933654785, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8643958568572998, + "num_tokens": 571624792.0, + "step": 14986 + }, + { + "epoch": 1.906500445235975, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6840943098068237, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8663976788520813, + "num_tokens": 571667197.0, + "step": 14987 + }, + { + "epoch": 1.9066276555145656, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9035584926605225, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8581081032752991, + "num_tokens": 571703648.0, + "step": 14988 + }, + { + "epoch": 1.9067548657931561, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0855631828308105, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8544768691062927, + "num_tokens": 571738980.0, + "step": 14989 + }, + { + "epoch": 1.9068820760717466, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1132123470306396, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8657991886138916, + "num_tokens": 571774540.0, + "step": 14990 + }, + { + "epoch": 1.9070092863503372, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.99899160861969, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8707925081253052, + "num_tokens": 571805286.0, + "step": 14991 + }, + { + "epoch": 1.9071364966289277, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9272773265838623, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8567914962768555, + "num_tokens": 571847936.0, + "step": 14992 + }, + { + "epoch": 1.9072637069075182, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.122598171234131, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8593515753746033, + "num_tokens": 571885454.0, + "step": 14993 + }, + { + "epoch": 1.9073909171861088, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9108701944351196, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8625193238258362, + "num_tokens": 571924870.0, + "step": 14994 + }, + { + "epoch": 1.9075181274646993, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0335283279418945, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8409979939460754, + "num_tokens": 571957870.0, + "step": 14995 + }, + { + "epoch": 1.9076453377432898, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9605356454849243, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8548341393470764, + "num_tokens": 571993645.0, + "step": 14996 + }, + { + "epoch": 1.9077725480218801, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.994044303894043, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8710426688194275, + "num_tokens": 572028594.0, + "step": 14997 + }, + { + "epoch": 1.9078997583004706, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0892062187194824, + "learning_rate": 1e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8360408544540405, + "num_tokens": 572066971.0, + "step": 14998 + }, + { + "epoch": 1.9080269685790612, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9154094457626343, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8690206408500671, + "num_tokens": 572107609.0, + "step": 14999 + }, + { + "epoch": 1.9081541788576517, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8972505331039429, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8712157607078552, + "num_tokens": 572146940.0, + "step": 15000 + }, + { + "epoch": 1.9082813891362422, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8636142015457153, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8876185417175293, + "num_tokens": 572184010.0, + "step": 15001 + }, + { + "epoch": 1.9084085994148328, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.075789213180542, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8476120829582214, + "num_tokens": 572223184.0, + "step": 15002 + }, + { + "epoch": 1.908535809693423, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1503963470458984, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8632349967956543, + "num_tokens": 572261770.0, + "step": 15003 + }, + { + "epoch": 1.9086630199720136, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.30543851852417, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8657238483428955, + "num_tokens": 572303229.0, + "step": 15004 + }, + { + "epoch": 1.9087902302506041, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2245755195617676, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.85155189037323, + "num_tokens": 572337839.0, + "step": 15005 + }, + { + "epoch": 1.9089174405291947, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9695546627044678, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8617578744888306, + "num_tokens": 572373604.0, + "step": 15006 + }, + { + "epoch": 1.9090446508077852, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9037419557571411, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8701645135879517, + "num_tokens": 572410425.0, + "step": 15007 + }, + { + "epoch": 1.9091718610863757, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.958919644355774, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8725446462631226, + "num_tokens": 572443998.0, + "step": 15008 + }, + { + "epoch": 1.9092990713649662, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9449009895324707, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8511847853660583, + "num_tokens": 572485337.0, + "step": 15009 + }, + { + "epoch": 1.9094262816435568, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8477035760879517, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8533253073692322, + "num_tokens": 572528273.0, + "step": 15010 + }, + { + "epoch": 1.9095534919221473, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8676202297210693, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8671068549156189, + "num_tokens": 572569467.0, + "step": 15011 + }, + { + "epoch": 1.9096807022007378, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8259410858154297, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8708676695823669, + "num_tokens": 572609626.0, + "step": 15012 + }, + { + "epoch": 1.9098079124793284, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.902422547340393, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8474295139312744, + "num_tokens": 572652907.0, + "step": 15013 + }, + { + "epoch": 1.9099351227579189, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 16.640911102294922, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8695342540740967, + "num_tokens": 572687460.0, + "step": 15014 + }, + { + "epoch": 1.9100623330365094, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.117837905883789, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8671584725379944, + "num_tokens": 572723937.0, + "step": 15015 + }, + { + "epoch": 1.9101895433151, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8544000387191772, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.858819305896759, + "num_tokens": 572764929.0, + "step": 15016 + }, + { + "epoch": 1.9103167535936905, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9489212036132812, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8621878027915955, + "num_tokens": 572802545.0, + "step": 15017 + }, + { + "epoch": 1.910443963872281, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6756575107574463, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8587095737457275, + "num_tokens": 572850008.0, + "step": 15018 + }, + { + "epoch": 1.9105711741508715, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8279829025268555, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8720595836639404, + "num_tokens": 572888968.0, + "step": 15019 + }, + { + "epoch": 1.910698384429462, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0553030967712402, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8614922165870667, + "num_tokens": 572926314.0, + "step": 15020 + }, + { + "epoch": 1.9108255947080524, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.019613742828369, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8688744306564331, + "num_tokens": 572961789.0, + "step": 15021 + }, + { + "epoch": 1.9109528049866429, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1511070728302, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.865848183631897, + "num_tokens": 572993680.0, + "step": 15022 + }, + { + "epoch": 1.9110800152652334, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9852486848831177, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8656935095787048, + "num_tokens": 573024003.0, + "step": 15023 + }, + { + "epoch": 1.911207225543824, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9640761613845825, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8697313070297241, + "num_tokens": 573055138.0, + "step": 15024 + }, + { + "epoch": 1.9113344358224145, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9189144372940063, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8670859336853027, + "num_tokens": 573092373.0, + "step": 15025 + }, + { + "epoch": 1.911461646101005, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8390350341796875, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8603301048278809, + "num_tokens": 573131486.0, + "step": 15026 + }, + { + "epoch": 1.9115888563795955, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8157851696014404, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8736098408699036, + "num_tokens": 573170114.0, + "step": 15027 + }, + { + "epoch": 1.9117160666581858, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.44143009185791, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8656586408615112, + "num_tokens": 573200906.0, + "step": 15028 + }, + { + "epoch": 1.9118432769367764, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9474366903305054, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8519301414489746, + "num_tokens": 573243731.0, + "step": 15029 + }, + { + "epoch": 1.9119704872153669, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8809922933578491, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8704841136932373, + "num_tokens": 573283335.0, + "step": 15030 + }, + { + "epoch": 1.9120976974939574, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7889010906219482, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8508731126785278, + "num_tokens": 573326628.0, + "step": 15031 + }, + { + "epoch": 1.912224907772548, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9597214460372925, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8691117167472839, + "num_tokens": 573369025.0, + "step": 15032 + }, + { + "epoch": 1.9123521180511385, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7936816215515137, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8558937907218933, + "num_tokens": 573411923.0, + "step": 15033 + }, + { + "epoch": 1.912479328329729, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.354585647583008, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8585731983184814, + "num_tokens": 573450455.0, + "step": 15034 + }, + { + "epoch": 1.9126065386083195, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9233314990997314, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8555596470832825, + "num_tokens": 573492886.0, + "step": 15035 + }, + { + "epoch": 1.91273374888691, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8310519456863403, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8687953948974609, + "num_tokens": 573532073.0, + "step": 15036 + }, + { + "epoch": 1.9128609591655006, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7028547525405884, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8792182803153992, + "num_tokens": 573573632.0, + "step": 15037 + }, + { + "epoch": 1.9129881694440911, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8471519947052002, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8727155327796936, + "num_tokens": 573610194.0, + "step": 15038 + }, + { + "epoch": 1.9131153797226816, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8363100290298462, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8557621240615845, + "num_tokens": 573650237.0, + "step": 15039 + }, + { + "epoch": 1.9132425900012722, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9406217336654663, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8666530847549438, + "num_tokens": 573686859.0, + "step": 15040 + }, + { + "epoch": 1.9133698002798627, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8858166933059692, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8631269335746765, + "num_tokens": 573726770.0, + "step": 15041 + }, + { + "epoch": 1.9134970105584532, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8479259014129639, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8581352233886719, + "num_tokens": 573768668.0, + "step": 15042 + }, + { + "epoch": 1.9136242208370438, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.904911994934082, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8543948531150818, + "num_tokens": 573807880.0, + "step": 15043 + }, + { + "epoch": 1.9137514311156343, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8676501512527466, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8625426292419434, + "num_tokens": 573847018.0, + "step": 15044 + }, + { + "epoch": 1.9138786413942248, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 9.556619644165039, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8789534568786621, + "num_tokens": 573881193.0, + "step": 15045 + }, + { + "epoch": 1.9140058516728151, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.146165370941162, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8724538087844849, + "num_tokens": 573918457.0, + "step": 15046 + }, + { + "epoch": 1.9141330619514056, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8039109706878662, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8842986822128296, + "num_tokens": 573958818.0, + "step": 15047 + }, + { + "epoch": 1.9142602722299962, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9150677919387817, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8594181537628174, + "num_tokens": 573996735.0, + "step": 15048 + }, + { + "epoch": 1.9143874825085867, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7820039987564087, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8681115508079529, + "num_tokens": 574035005.0, + "step": 15049 + }, + { + "epoch": 1.9145146927871772, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6385952234268188, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8628463745117188, + "num_tokens": 574077817.0, + "step": 15050 + }, + { + "epoch": 1.9146419030657678, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.789466142654419, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8708395957946777, + "num_tokens": 574113083.0, + "step": 15051 + }, + { + "epoch": 1.914769113344358, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9079973697662354, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8682056069374084, + "num_tokens": 574147324.0, + "step": 15052 + }, + { + "epoch": 1.9148963236229486, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7691423892974854, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8805086016654968, + "num_tokens": 574186108.0, + "step": 15053 + }, + { + "epoch": 1.9150235339015391, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9896392822265625, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8758898973464966, + "num_tokens": 574218076.0, + "step": 15054 + }, + { + "epoch": 1.9151507441801296, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.896089792251587, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8742265701293945, + "num_tokens": 574256416.0, + "step": 15055 + }, + { + "epoch": 1.9152779544587202, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7895172834396362, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.876776933670044, + "num_tokens": 574295262.0, + "step": 15056 + }, + { + "epoch": 1.9154051647373107, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9250617027282715, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8531282544136047, + "num_tokens": 574329836.0, + "step": 15057 + }, + { + "epoch": 1.9155323750159012, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.929714560508728, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.866518497467041, + "num_tokens": 574363189.0, + "step": 15058 + }, + { + "epoch": 1.9156595852944918, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3123178482055664, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8728430271148682, + "num_tokens": 574396374.0, + "step": 15059 + }, + { + "epoch": 1.9157867955730823, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0398929119110107, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8660206198692322, + "num_tokens": 574435488.0, + "step": 15060 + }, + { + "epoch": 1.9159140058516728, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9215067625045776, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8633789420127869, + "num_tokens": 574475261.0, + "step": 15061 + }, + { + "epoch": 1.9160412161302633, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6986761093139648, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.884247899055481, + "num_tokens": 574513531.0, + "step": 15062 + }, + { + "epoch": 1.9161684264088539, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.976128339767456, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8506177663803101, + "num_tokens": 574551736.0, + "step": 15063 + }, + { + "epoch": 1.9162956366874444, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9251441955566406, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8579230308532715, + "num_tokens": 574589978.0, + "step": 15064 + }, + { + "epoch": 1.916422846966035, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9283056259155273, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8433274626731873, + "num_tokens": 574628845.0, + "step": 15065 + }, + { + "epoch": 1.9165500572446255, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.012377977371216, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8618466258049011, + "num_tokens": 574660700.0, + "step": 15066 + }, + { + "epoch": 1.916677267523216, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9255599975585938, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8566145896911621, + "num_tokens": 574692195.0, + "step": 15067 + }, + { + "epoch": 1.9168044778018065, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0128426551818848, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8537174463272095, + "num_tokens": 574732462.0, + "step": 15068 + }, + { + "epoch": 1.916931688080397, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8111920356750488, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8707975149154663, + "num_tokens": 574769353.0, + "step": 15069 + }, + { + "epoch": 1.9170588983589874, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.864627718925476, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8651074171066284, + "num_tokens": 574804958.0, + "step": 15070 + }, + { + "epoch": 1.9171861086375779, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8433454036712646, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8640186786651611, + "num_tokens": 574843966.0, + "step": 15071 + }, + { + "epoch": 1.9173133189161684, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8981417417526245, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.869372546672821, + "num_tokens": 574882916.0, + "step": 15072 + }, + { + "epoch": 1.917440529194759, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8716018199920654, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.87223219871521, + "num_tokens": 574922154.0, + "step": 15073 + }, + { + "epoch": 1.9175677394733495, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9198875427246094, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8603465557098389, + "num_tokens": 574956894.0, + "step": 15074 + }, + { + "epoch": 1.91769494975194, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8045084476470947, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8781747817993164, + "num_tokens": 574998343.0, + "step": 15075 + }, + { + "epoch": 1.9178221600305305, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8232771158218384, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8732216358184814, + "num_tokens": 575036689.0, + "step": 15076 + }, + { + "epoch": 1.9179493703091208, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.850592851638794, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8583358526229858, + "num_tokens": 575075510.0, + "step": 15077 + }, + { + "epoch": 1.9180765805877114, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8037713766098022, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8605703115463257, + "num_tokens": 575115797.0, + "step": 15078 + }, + { + "epoch": 1.9182037908663019, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9182735681533813, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.850260853767395, + "num_tokens": 575155240.0, + "step": 15079 + }, + { + "epoch": 1.9183310011448924, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.054500102996826, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8521851897239685, + "num_tokens": 575191048.0, + "step": 15080 + }, + { + "epoch": 1.918458211423483, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9841183423995972, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8524312973022461, + "num_tokens": 575227778.0, + "step": 15081 + }, + { + "epoch": 1.9185854217020735, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8000565767288208, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8649046421051025, + "num_tokens": 575269774.0, + "step": 15082 + }, + { + "epoch": 1.918712631980664, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9592866897583008, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.854453444480896, + "num_tokens": 575310710.0, + "step": 15083 + }, + { + "epoch": 1.9188398422592545, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8667742013931274, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8712095022201538, + "num_tokens": 575344073.0, + "step": 15084 + }, + { + "epoch": 1.918967052537845, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.848593831062317, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8535515666007996, + "num_tokens": 575386737.0, + "step": 15085 + }, + { + "epoch": 1.9190942628164356, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8352888822555542, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8684996962547302, + "num_tokens": 575424287.0, + "step": 15086 + }, + { + "epoch": 1.919221473095026, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0324654579162598, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8654958009719849, + "num_tokens": 575463859.0, + "step": 15087 + }, + { + "epoch": 1.9193486833736166, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.965633749961853, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8581101894378662, + "num_tokens": 575500838.0, + "step": 15088 + }, + { + "epoch": 1.9194758936522072, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8876402378082275, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8558326959609985, + "num_tokens": 575535164.0, + "step": 15089 + }, + { + "epoch": 1.9196031039307977, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8842999935150146, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8694677948951721, + "num_tokens": 575569493.0, + "step": 15090 + }, + { + "epoch": 1.9197303142093882, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1893186569213867, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8462176322937012, + "num_tokens": 575603396.0, + "step": 15091 + }, + { + "epoch": 1.9198575244879788, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8882955312728882, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8709226846694946, + "num_tokens": 575638255.0, + "step": 15092 + }, + { + "epoch": 1.9199847347665693, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7220665216445923, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8544835448265076, + "num_tokens": 575685281.0, + "step": 15093 + }, + { + "epoch": 1.9201119450451598, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9855353832244873, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8707098364830017, + "num_tokens": 575725030.0, + "step": 15094 + }, + { + "epoch": 1.9202391553237501, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7925325632095337, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8579330444335938, + "num_tokens": 575764237.0, + "step": 15095 + }, + { + "epoch": 1.9203663656023406, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8849689960479736, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8619085550308228, + "num_tokens": 575796732.0, + "step": 15096 + }, + { + "epoch": 1.9204935758809312, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9466488361358643, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8683934211730957, + "num_tokens": 575833133.0, + "step": 15097 + }, + { + "epoch": 1.9206207861595217, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8115326166152954, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8594846725463867, + "num_tokens": 575870920.0, + "step": 15098 + }, + { + "epoch": 1.9207479964381122, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.860973596572876, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8667259812355042, + "num_tokens": 575909132.0, + "step": 15099 + }, + { + "epoch": 1.9208752067167028, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8347021341323853, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8642359972000122, + "num_tokens": 575947979.0, + "step": 15100 + }, + { + "epoch": 1.921002416995293, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9720722436904907, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8528424501419067, + "num_tokens": 575988888.0, + "step": 15101 + }, + { + "epoch": 1.9211296272738836, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8672393560409546, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8655039072036743, + "num_tokens": 576028614.0, + "step": 15102 + }, + { + "epoch": 1.9212568375524741, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8740087747573853, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8918485045433044, + "num_tokens": 576066648.0, + "step": 15103 + }, + { + "epoch": 1.9213840478310646, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9638352394104004, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8605993986129761, + "num_tokens": 576104045.0, + "step": 15104 + }, + { + "epoch": 1.9215112581096552, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.85025155544281, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8580144047737122, + "num_tokens": 576145419.0, + "step": 15105 + }, + { + "epoch": 1.9216384683882457, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7501753568649292, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8605489730834961, + "num_tokens": 576191410.0, + "step": 15106 + }, + { + "epoch": 1.9217656786668362, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9917919635772705, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8592391014099121, + "num_tokens": 576226383.0, + "step": 15107 + }, + { + "epoch": 1.9218928889454268, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0748329162597656, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8563566207885742, + "num_tokens": 576259958.0, + "step": 15108 + }, + { + "epoch": 1.9220200992240173, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.905308485031128, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8673141002655029, + "num_tokens": 576299467.0, + "step": 15109 + }, + { + "epoch": 1.9221473095026078, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9202370643615723, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8590655326843262, + "num_tokens": 576334137.0, + "step": 15110 + }, + { + "epoch": 1.9222745197811983, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.812435269355774, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8602452874183655, + "num_tokens": 576379582.0, + "step": 15111 + }, + { + "epoch": 1.9224017300597889, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1531105041503906, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8779875636100769, + "num_tokens": 576414590.0, + "step": 15112 + }, + { + "epoch": 1.9225289403383794, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.086869239807129, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8543503880500793, + "num_tokens": 576445023.0, + "step": 15113 + }, + { + "epoch": 1.92265615061697, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.104790449142456, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8593007922172546, + "num_tokens": 576477861.0, + "step": 15114 + }, + { + "epoch": 1.9227833608955605, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0883982181549072, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8595379590988159, + "num_tokens": 576513121.0, + "step": 15115 + }, + { + "epoch": 1.922910571174151, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8527261018753052, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8689603209495544, + "num_tokens": 576551039.0, + "step": 15116 + }, + { + "epoch": 1.9230377814527415, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0468506813049316, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8563553094863892, + "num_tokens": 576590651.0, + "step": 15117 + }, + { + "epoch": 1.923164991731332, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.575965166091919, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8552049398422241, + "num_tokens": 576629119.0, + "step": 15118 + }, + { + "epoch": 1.9232922020099223, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0872082710266113, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8719260692596436, + "num_tokens": 576664240.0, + "step": 15119 + }, + { + "epoch": 1.9234194122885129, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9680790901184082, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8573269844055176, + "num_tokens": 576701946.0, + "step": 15120 + }, + { + "epoch": 1.9235466225671034, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.116076946258545, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8740040063858032, + "num_tokens": 576733541.0, + "step": 15121 + }, + { + "epoch": 1.923673832845694, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.211118221282959, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8488638997077942, + "num_tokens": 576770930.0, + "step": 15122 + }, + { + "epoch": 1.9238010431242845, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.175116777420044, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8642725944519043, + "num_tokens": 576807455.0, + "step": 15123 + }, + { + "epoch": 1.923928253402875, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7954069375991821, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.870863676071167, + "num_tokens": 576849577.0, + "step": 15124 + }, + { + "epoch": 1.9240554636814655, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.049480438232422, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8569343686103821, + "num_tokens": 576883506.0, + "step": 15125 + }, + { + "epoch": 1.9241826739600558, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.5895752906799316, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8471732139587402, + "num_tokens": 576923892.0, + "step": 15126 + }, + { + "epoch": 1.9243098842386464, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.793655514717102, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8811957836151123, + "num_tokens": 576970832.0, + "step": 15127 + }, + { + "epoch": 1.9244370945172369, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 20.476579666137695, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8448599576950073, + "num_tokens": 577006990.0, + "step": 15128 + }, + { + "epoch": 1.9245643047958274, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9401861429214478, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8693829774856567, + "num_tokens": 577044932.0, + "step": 15129 + }, + { + "epoch": 1.924691515074418, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.174222707748413, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8469464182853699, + "num_tokens": 577076217.0, + "step": 15130 + }, + { + "epoch": 1.9248187253530085, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8205307722091675, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8797262907028198, + "num_tokens": 577112696.0, + "step": 15131 + }, + { + "epoch": 1.924945935631599, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.45784068107605, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8636417984962463, + "num_tokens": 577156397.0, + "step": 15132 + }, + { + "epoch": 1.9250731459101895, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8344162702560425, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8678035140037537, + "num_tokens": 577194192.0, + "step": 15133 + }, + { + "epoch": 1.92520035618878, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8476130962371826, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8587673902511597, + "num_tokens": 577232639.0, + "step": 15134 + }, + { + "epoch": 1.9253275664673706, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0468409061431885, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8722507953643799, + "num_tokens": 577267349.0, + "step": 15135 + }, + { + "epoch": 1.925454776745961, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.844553828239441, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8598193526268005, + "num_tokens": 577308329.0, + "step": 15136 + }, + { + "epoch": 1.9255819870245516, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.047227144241333, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8646407127380371, + "num_tokens": 577347825.0, + "step": 15137 + }, + { + "epoch": 1.9257091973031422, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0123322010040283, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8636904954910278, + "num_tokens": 577379981.0, + "step": 15138 + }, + { + "epoch": 1.9258364075817327, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.911681890487671, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8686342239379883, + "num_tokens": 577414134.0, + "step": 15139 + }, + { + "epoch": 1.9259636178603232, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8491528034210205, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8620641827583313, + "num_tokens": 577451170.0, + "step": 15140 + }, + { + "epoch": 1.9260908281389137, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.967107892036438, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8474377393722534, + "num_tokens": 577487447.0, + "step": 15141 + }, + { + "epoch": 1.9262180384175043, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8266708850860596, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8680875301361084, + "num_tokens": 577525732.0, + "step": 15142 + }, + { + "epoch": 1.9263452486960948, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7704344987869263, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8628297448158264, + "num_tokens": 577563303.0, + "step": 15143 + }, + { + "epoch": 1.926472458974685, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8222309350967407, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.877682626247406, + "num_tokens": 577607592.0, + "step": 15144 + }, + { + "epoch": 1.9265996692532756, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8649686574935913, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8545961380004883, + "num_tokens": 577651646.0, + "step": 15145 + }, + { + "epoch": 1.9267268795318662, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1177525520324707, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8720658421516418, + "num_tokens": 577689530.0, + "step": 15146 + }, + { + "epoch": 1.9268540898104567, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8666539192199707, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8576103448867798, + "num_tokens": 577732524.0, + "step": 15147 + }, + { + "epoch": 1.9269813000890472, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9449669122695923, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8675152659416199, + "num_tokens": 577767164.0, + "step": 15148 + }, + { + "epoch": 1.9271085103676378, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.053246259689331, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.87117999792099, + "num_tokens": 577802347.0, + "step": 15149 + }, + { + "epoch": 1.927235720646228, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9069538116455078, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8550660610198975, + "num_tokens": 577842899.0, + "step": 15150 + }, + { + "epoch": 1.9273629309248186, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8794022798538208, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8630928993225098, + "num_tokens": 577885137.0, + "step": 15151 + }, + { + "epoch": 1.9274901412034091, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.735377311706543, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8768715858459473, + "num_tokens": 577925727.0, + "step": 15152 + }, + { + "epoch": 1.9276173514819996, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7208139896392822, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8681504726409912, + "num_tokens": 577965399.0, + "step": 15153 + }, + { + "epoch": 1.9277445617605902, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8556181192398071, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8624647259712219, + "num_tokens": 578007608.0, + "step": 15154 + }, + { + "epoch": 1.9278717720391807, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7995446920394897, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8739588260650635, + "num_tokens": 578043425.0, + "step": 15155 + }, + { + "epoch": 1.9279989823177712, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.948102355003357, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8593119382858276, + "num_tokens": 578075838.0, + "step": 15156 + }, + { + "epoch": 1.9281261925963618, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7979545593261719, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8583475351333618, + "num_tokens": 578110113.0, + "step": 15157 + }, + { + "epoch": 1.9282534028749523, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.823534607887268, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8633878231048584, + "num_tokens": 578145743.0, + "step": 15158 + }, + { + "epoch": 1.9283806131535428, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9310948848724365, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.870868980884552, + "num_tokens": 578181588.0, + "step": 15159 + }, + { + "epoch": 1.9285078234321333, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.845184326171875, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8616077899932861, + "num_tokens": 578222048.0, + "step": 15160 + }, + { + "epoch": 1.9286350337107239, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0107498168945312, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8663798570632935, + "num_tokens": 578262664.0, + "step": 15161 + }, + { + "epoch": 1.9287622439893144, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.862141728401184, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.85371994972229, + "num_tokens": 578303431.0, + "step": 15162 + }, + { + "epoch": 1.928889454267905, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9867949485778809, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8537393808364868, + "num_tokens": 578338978.0, + "step": 15163 + }, + { + "epoch": 1.9290166645464955, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8358418941497803, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8594115972518921, + "num_tokens": 578376340.0, + "step": 15164 + }, + { + "epoch": 1.929143874825086, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8361741304397583, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8645231127738953, + "num_tokens": 578414714.0, + "step": 15165 + }, + { + "epoch": 1.9292710851036765, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 7.733870506286621, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.849707841873169, + "num_tokens": 578453668.0, + "step": 15166 + }, + { + "epoch": 1.929398295382267, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.915384292602539, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8551619052886963, + "num_tokens": 578494148.0, + "step": 15167 + }, + { + "epoch": 1.9295255056608573, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.254549503326416, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8425074815750122, + "num_tokens": 578530743.0, + "step": 15168 + }, + { + "epoch": 1.9296527159394479, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8459452390670776, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8851000666618347, + "num_tokens": 578567613.0, + "step": 15169 + }, + { + "epoch": 1.9297799262180384, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.027283191680908, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8616576194763184, + "num_tokens": 578602981.0, + "step": 15170 + }, + { + "epoch": 1.929907136496629, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8790054321289062, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8570564985275269, + "num_tokens": 578641027.0, + "step": 15171 + }, + { + "epoch": 1.9300343467752195, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7228730916976929, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8763335943222046, + "num_tokens": 578683006.0, + "step": 15172 + }, + { + "epoch": 1.93016155705381, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.01745867729187, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8665139675140381, + "num_tokens": 578716722.0, + "step": 15173 + }, + { + "epoch": 1.9302887673324005, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9402841329574585, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8437514305114746, + "num_tokens": 578750740.0, + "step": 15174 + }, + { + "epoch": 1.9304159776109908, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.832733392715454, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8660182952880859, + "num_tokens": 578791358.0, + "step": 15175 + }, + { + "epoch": 1.9305431878895813, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7978581190109253, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8475772142410278, + "num_tokens": 578834514.0, + "step": 15176 + }, + { + "epoch": 1.9306703981681719, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7521106004714966, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8560938835144043, + "num_tokens": 578875869.0, + "step": 15177 + }, + { + "epoch": 1.9307976084467624, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7079042196273804, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8666788339614868, + "num_tokens": 578921079.0, + "step": 15178 + }, + { + "epoch": 1.930924818725353, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7831320762634277, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8619925379753113, + "num_tokens": 578959837.0, + "step": 15179 + }, + { + "epoch": 1.9310520290039435, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.880660057067871, + "learning_rate": 1e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8349011540412903, + "num_tokens": 579001629.0, + "step": 15180 + }, + { + "epoch": 1.931179239282534, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8243407011032104, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8863496780395508, + "num_tokens": 579036762.0, + "step": 15181 + }, + { + "epoch": 1.9313064495611245, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1196181774139404, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.842450737953186, + "num_tokens": 579078455.0, + "step": 15182 + }, + { + "epoch": 1.931433659839715, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8097114562988281, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8664682507514954, + "num_tokens": 579116942.0, + "step": 15183 + }, + { + "epoch": 1.9315608701183056, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8169527053833008, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.877094030380249, + "num_tokens": 579155180.0, + "step": 15184 + }, + { + "epoch": 1.931688080396896, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 7.815306186676025, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8562151193618774, + "num_tokens": 579194336.0, + "step": 15185 + }, + { + "epoch": 1.9318152906754866, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1904921531677246, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8583881855010986, + "num_tokens": 579226609.0, + "step": 15186 + }, + { + "epoch": 1.9319425009540772, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1710116863250732, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8681381344795227, + "num_tokens": 579259288.0, + "step": 15187 + }, + { + "epoch": 1.9320697112326677, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0116000175476074, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8714569807052612, + "num_tokens": 579294797.0, + "step": 15188 + }, + { + "epoch": 1.9321969215112582, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7568625211715698, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8726012110710144, + "num_tokens": 579334352.0, + "step": 15189 + }, + { + "epoch": 1.9323241317898487, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.756982684135437, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8521944880485535, + "num_tokens": 579379618.0, + "step": 15190 + }, + { + "epoch": 1.9324513420684393, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8059934377670288, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8621450662612915, + "num_tokens": 579420230.0, + "step": 15191 + }, + { + "epoch": 1.9325785523470298, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9917556047439575, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8629142045974731, + "num_tokens": 579458236.0, + "step": 15192 + }, + { + "epoch": 1.93270576262562, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8951189517974854, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8627538681030273, + "num_tokens": 579495561.0, + "step": 15193 + }, + { + "epoch": 1.9328329729042106, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.068216562271118, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8614623546600342, + "num_tokens": 579530225.0, + "step": 15194 + }, + { + "epoch": 1.9329601831828012, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9536216259002686, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8547512292861938, + "num_tokens": 579570143.0, + "step": 15195 + }, + { + "epoch": 1.9330873934613917, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7350122928619385, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8772017955780029, + "num_tokens": 579607105.0, + "step": 15196 + }, + { + "epoch": 1.9332146037399822, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8182982206344604, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8786473870277405, + "num_tokens": 579640018.0, + "step": 15197 + }, + { + "epoch": 1.9333418140185727, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9147049188613892, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8580133318901062, + "num_tokens": 579681250.0, + "step": 15198 + }, + { + "epoch": 1.933469024297163, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.866018533706665, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8535130620002747, + "num_tokens": 579718535.0, + "step": 15199 + }, + { + "epoch": 1.9335962345757536, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9152334928512573, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8518682718276978, + "num_tokens": 579756959.0, + "step": 15200 + }, + { + "epoch": 1.933723444854344, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7522759437561035, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8552626371383667, + "num_tokens": 579798266.0, + "step": 15201 + }, + { + "epoch": 1.9338506551329346, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8960117101669312, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.882331371307373, + "num_tokens": 579831869.0, + "step": 15202 + }, + { + "epoch": 1.9339778654115252, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7674825191497803, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8653236627578735, + "num_tokens": 579874168.0, + "step": 15203 + }, + { + "epoch": 1.9341050756901157, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8625558614730835, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8692064881324768, + "num_tokens": 579916313.0, + "step": 15204 + }, + { + "epoch": 1.9342322859687062, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.700760006904602, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8750481009483337, + "num_tokens": 579959022.0, + "step": 15205 + }, + { + "epoch": 1.9343594962472968, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8900095224380493, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8489958047866821, + "num_tokens": 580003675.0, + "step": 15206 + }, + { + "epoch": 1.9344867065258873, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6545926332473755, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8724124431610107, + "num_tokens": 580044384.0, + "step": 15207 + }, + { + "epoch": 1.9346139168044778, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8906466960906982, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8523120284080505, + "num_tokens": 580078955.0, + "step": 15208 + }, + { + "epoch": 1.9347411270830683, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9913820028305054, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8612379431724548, + "num_tokens": 580116793.0, + "step": 15209 + }, + { + "epoch": 1.9348683373616589, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2599127292633057, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8628292679786682, + "num_tokens": 580153622.0, + "step": 15210 + }, + { + "epoch": 1.9349955476402494, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8305553197860718, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8611046671867371, + "num_tokens": 580195287.0, + "step": 15211 + }, + { + "epoch": 1.93512275791884, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9416840076446533, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8502237200737, + "num_tokens": 580229601.0, + "step": 15212 + }, + { + "epoch": 1.9352499681974304, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7733129262924194, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8663181662559509, + "num_tokens": 580268079.0, + "step": 15213 + }, + { + "epoch": 1.935377178476021, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.931566834449768, + "learning_rate": 1e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8452354669570923, + "num_tokens": 580305379.0, + "step": 15214 + }, + { + "epoch": 1.9355043887546115, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1210646629333496, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8618420362472534, + "num_tokens": 580338438.0, + "step": 15215 + }, + { + "epoch": 1.935631599033202, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.961950421333313, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8628085255622864, + "num_tokens": 580370017.0, + "step": 15216 + }, + { + "epoch": 1.9357588093117923, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8357484340667725, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8754969835281372, + "num_tokens": 580409572.0, + "step": 15217 + }, + { + "epoch": 1.9358860195903829, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8442994356155396, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.867807924747467, + "num_tokens": 580448481.0, + "step": 15218 + }, + { + "epoch": 1.9360132298689734, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8935472965240479, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8755244016647339, + "num_tokens": 580481919.0, + "step": 15219 + }, + { + "epoch": 1.936140440147564, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8668780326843262, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8582634925842285, + "num_tokens": 580520758.0, + "step": 15220 + }, + { + "epoch": 1.9362676504261545, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.9747257232666016, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8549603223800659, + "num_tokens": 580559839.0, + "step": 15221 + }, + { + "epoch": 1.936394860704745, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9526212215423584, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8890947103500366, + "num_tokens": 580591767.0, + "step": 15222 + }, + { + "epoch": 1.9365220709833355, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8776065111160278, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8722671866416931, + "num_tokens": 580633180.0, + "step": 15223 + }, + { + "epoch": 1.9366492812619258, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8148773908615112, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8658682107925415, + "num_tokens": 580677045.0, + "step": 15224 + }, + { + "epoch": 1.9367764915405163, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6923866271972656, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8759018182754517, + "num_tokens": 580717423.0, + "step": 15225 + }, + { + "epoch": 1.9369037018191069, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.4764716625213623, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8669726848602295, + "num_tokens": 580756677.0, + "step": 15226 + }, + { + "epoch": 1.9370309120976974, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9299815893173218, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8677849769592285, + "num_tokens": 580796993.0, + "step": 15227 + }, + { + "epoch": 1.937158122376288, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8518168926239014, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8667638897895813, + "num_tokens": 580834616.0, + "step": 15228 + }, + { + "epoch": 1.9372853326548785, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8918739557266235, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.865788459777832, + "num_tokens": 580873718.0, + "step": 15229 + }, + { + "epoch": 1.937412542933469, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9137121438980103, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8603804707527161, + "num_tokens": 580911678.0, + "step": 15230 + }, + { + "epoch": 1.9375397532120595, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7923003435134888, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8694626092910767, + "num_tokens": 580953282.0, + "step": 15231 + }, + { + "epoch": 1.93766696349065, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7220730781555176, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8678947687149048, + "num_tokens": 580997954.0, + "step": 15232 + }, + { + "epoch": 1.9377941737692406, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1140220165252686, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8641847968101501, + "num_tokens": 581035623.0, + "step": 15233 + }, + { + "epoch": 1.937921384047831, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.046344041824341, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8733221292495728, + "num_tokens": 581068727.0, + "step": 15234 + }, + { + "epoch": 1.9380485943264216, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.855523943901062, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.862734317779541, + "num_tokens": 581111180.0, + "step": 15235 + }, + { + "epoch": 1.9381758046050122, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0467183589935303, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8450505137443542, + "num_tokens": 581152373.0, + "step": 15236 + }, + { + "epoch": 1.9383030148836027, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7999377250671387, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8611559867858887, + "num_tokens": 581195854.0, + "step": 15237 + }, + { + "epoch": 1.9384302251621932, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1568288803100586, + "learning_rate": 1e-06, + "loss": 0.5334, + "mean_token_accuracy": 0.8353320360183716, + "num_tokens": 581236414.0, + "step": 15238 + }, + { + "epoch": 1.9385574354407837, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9936946630477905, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.848456621170044, + "num_tokens": 581275349.0, + "step": 15239 + }, + { + "epoch": 1.9386846457193743, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8451292514801025, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8709069490432739, + "num_tokens": 581313231.0, + "step": 15240 + }, + { + "epoch": 1.9388118559979648, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7773634195327759, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8636847734451294, + "num_tokens": 581355295.0, + "step": 15241 + }, + { + "epoch": 1.938939066276555, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9351037740707397, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8669359683990479, + "num_tokens": 581392633.0, + "step": 15242 + }, + { + "epoch": 1.9390662765551456, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.237825632095337, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8764234781265259, + "num_tokens": 581425552.0, + "step": 15243 + }, + { + "epoch": 1.9391934868337362, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9157391786575317, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8509687185287476, + "num_tokens": 581467707.0, + "step": 15244 + }, + { + "epoch": 1.9393206971123267, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8479063510894775, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8699476718902588, + "num_tokens": 581504888.0, + "step": 15245 + }, + { + "epoch": 1.9394479073909172, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9892940521240234, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8627673387527466, + "num_tokens": 581537800.0, + "step": 15246 + }, + { + "epoch": 1.9395751176695077, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.97444748878479, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8843706250190735, + "num_tokens": 581570594.0, + "step": 15247 + }, + { + "epoch": 1.939702327948098, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8391122817993164, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8549792170524597, + "num_tokens": 581610764.0, + "step": 15248 + }, + { + "epoch": 1.9398295382266886, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7924225330352783, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8638288974761963, + "num_tokens": 581652341.0, + "step": 15249 + }, + { + "epoch": 1.939956748505279, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.840279459953308, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8555096387863159, + "num_tokens": 581689978.0, + "step": 15250 + }, + { + "epoch": 1.9400839587838696, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.054147720336914, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8730227947235107, + "num_tokens": 581728811.0, + "step": 15251 + }, + { + "epoch": 1.9402111690624602, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9040443897247314, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.872948169708252, + "num_tokens": 581768016.0, + "step": 15252 + }, + { + "epoch": 1.9403383793410507, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8297491073608398, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8673093318939209, + "num_tokens": 581805405.0, + "step": 15253 + }, + { + "epoch": 1.9404655896196412, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9324458837509155, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8438138961791992, + "num_tokens": 581845302.0, + "step": 15254 + }, + { + "epoch": 1.9405927998982317, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0607757568359375, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.872287392616272, + "num_tokens": 581875391.0, + "step": 15255 + }, + { + "epoch": 1.9407200101768223, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9246482849121094, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.863655686378479, + "num_tokens": 581916580.0, + "step": 15256 + }, + { + "epoch": 1.9408472204554128, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.867416262626648, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8651694059371948, + "num_tokens": 581958539.0, + "step": 15257 + }, + { + "epoch": 1.9409744307340033, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.888565182685852, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8620891571044922, + "num_tokens": 581996981.0, + "step": 15258 + }, + { + "epoch": 1.9411016410125939, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9809987545013428, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8735288381576538, + "num_tokens": 582036068.0, + "step": 15259 + }, + { + "epoch": 1.9412288512911844, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8695218563079834, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8658543825149536, + "num_tokens": 582073322.0, + "step": 15260 + }, + { + "epoch": 1.941356061569775, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0848922729492188, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8662670254707336, + "num_tokens": 582110736.0, + "step": 15261 + }, + { + "epoch": 1.9414832718483654, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9111723899841309, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8462381362915039, + "num_tokens": 582153431.0, + "step": 15262 + }, + { + "epoch": 1.941610482126956, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1046500205993652, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8454307317733765, + "num_tokens": 582191793.0, + "step": 15263 + }, + { + "epoch": 1.9417376924055465, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9682708978652954, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8546752333641052, + "num_tokens": 582227923.0, + "step": 15264 + }, + { + "epoch": 1.941864902684137, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.013068675994873, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8471964597702026, + "num_tokens": 582268029.0, + "step": 15265 + }, + { + "epoch": 1.9419921129627273, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9426037073135376, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8575253486633301, + "num_tokens": 582302173.0, + "step": 15266 + }, + { + "epoch": 1.9421193232413179, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7585837841033936, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8793279528617859, + "num_tokens": 582341642.0, + "step": 15267 + }, + { + "epoch": 1.9422465335199084, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.7809441089630127, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8657300472259521, + "num_tokens": 582378960.0, + "step": 15268 + }, + { + "epoch": 1.942373743798499, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8948662281036377, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8704367876052856, + "num_tokens": 582422149.0, + "step": 15269 + }, + { + "epoch": 1.9425009540770894, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9134892225265503, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8488179445266724, + "num_tokens": 582462657.0, + "step": 15270 + }, + { + "epoch": 1.94262816435568, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8851053714752197, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8760913610458374, + "num_tokens": 582499264.0, + "step": 15271 + }, + { + "epoch": 1.9427553746342705, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9349172115325928, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8674201965332031, + "num_tokens": 582534744.0, + "step": 15272 + }, + { + "epoch": 1.9428825849128608, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7179417610168457, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.859713077545166, + "num_tokens": 582577010.0, + "step": 15273 + }, + { + "epoch": 1.9430097951914513, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.907832384109497, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8583186864852905, + "num_tokens": 582617103.0, + "step": 15274 + }, + { + "epoch": 1.9431370054700419, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8696304559707642, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8573683500289917, + "num_tokens": 582661754.0, + "step": 15275 + }, + { + "epoch": 1.9432642157486324, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9413621425628662, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8653398752212524, + "num_tokens": 582701701.0, + "step": 15276 + }, + { + "epoch": 1.943391426027223, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9333668947219849, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8542559742927551, + "num_tokens": 582741977.0, + "step": 15277 + }, + { + "epoch": 1.9435186363058135, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7451926469802856, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8737331628799438, + "num_tokens": 582783420.0, + "step": 15278 + }, + { + "epoch": 1.943645846584404, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8606703281402588, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8711551427841187, + "num_tokens": 582819464.0, + "step": 15279 + }, + { + "epoch": 1.9437730568629945, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8230863809585571, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8682191371917725, + "num_tokens": 582860757.0, + "step": 15280 + }, + { + "epoch": 1.943900267141585, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7390007972717285, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8581955432891846, + "num_tokens": 582910300.0, + "step": 15281 + }, + { + "epoch": 1.9440274774201756, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.816937804222107, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8573856353759766, + "num_tokens": 582956333.0, + "step": 15282 + }, + { + "epoch": 1.944154687698766, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8873969316482544, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8728352785110474, + "num_tokens": 582994068.0, + "step": 15283 + }, + { + "epoch": 1.9442818979773566, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1287248134613037, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8610137104988098, + "num_tokens": 583027829.0, + "step": 15284 + }, + { + "epoch": 1.9444091082559471, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0965020656585693, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8619581460952759, + "num_tokens": 583068881.0, + "step": 15285 + }, + { + "epoch": 1.9445363185345377, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0279455184936523, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8675245642662048, + "num_tokens": 583100097.0, + "step": 15286 + }, + { + "epoch": 1.9446635288131282, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9741748571395874, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8629416227340698, + "num_tokens": 583135340.0, + "step": 15287 + }, + { + "epoch": 1.9447907390917187, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9890968799591064, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.863323986530304, + "num_tokens": 583172032.0, + "step": 15288 + }, + { + "epoch": 1.9449179493703093, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8756399154663086, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8682848811149597, + "num_tokens": 583205244.0, + "step": 15289 + }, + { + "epoch": 1.9450451596488998, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7650744915008545, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.858736515045166, + "num_tokens": 583247537.0, + "step": 15290 + }, + { + "epoch": 1.94517236992749, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.989118218421936, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8579771518707275, + "num_tokens": 583285349.0, + "step": 15291 + }, + { + "epoch": 1.9452995802060806, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.829428791999817, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8660475015640259, + "num_tokens": 583323810.0, + "step": 15292 + }, + { + "epoch": 1.9454267904846712, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8775064945220947, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8761941194534302, + "num_tokens": 583354244.0, + "step": 15293 + }, + { + "epoch": 1.9455540007632617, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8311128616333008, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8786123394966125, + "num_tokens": 583391764.0, + "step": 15294 + }, + { + "epoch": 1.9456812110418522, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.902024507522583, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8521151542663574, + "num_tokens": 583426305.0, + "step": 15295 + }, + { + "epoch": 1.9458084213204427, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8603441715240479, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8603958487510681, + "num_tokens": 583465252.0, + "step": 15296 + }, + { + "epoch": 1.945935631599033, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.734859585762024, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8548763394355774, + "num_tokens": 583510001.0, + "step": 15297 + }, + { + "epoch": 1.9460628418776236, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.79221773147583, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8663377165794373, + "num_tokens": 583547380.0, + "step": 15298 + }, + { + "epoch": 1.946190052156214, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7528619766235352, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8648331165313721, + "num_tokens": 583588852.0, + "step": 15299 + }, + { + "epoch": 1.9463172624348046, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8467401266098022, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8693827390670776, + "num_tokens": 583634006.0, + "step": 15300 + }, + { + "epoch": 1.9464444727133952, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.99007248878479, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8491415977478027, + "num_tokens": 583679430.0, + "step": 15301 + }, + { + "epoch": 1.9465716829919857, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.036670446395874, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8697299957275391, + "num_tokens": 583708856.0, + "step": 15302 + }, + { + "epoch": 1.9466988932705762, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.036527633666992, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8623529672622681, + "num_tokens": 583750737.0, + "step": 15303 + }, + { + "epoch": 1.9468261035491667, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.046360492706299, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.868039608001709, + "num_tokens": 583786342.0, + "step": 15304 + }, + { + "epoch": 1.9469533138277573, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.219017267227173, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8690542578697205, + "num_tokens": 583821681.0, + "step": 15305 + }, + { + "epoch": 1.9470805241063478, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.065256118774414, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8442083597183228, + "num_tokens": 583855640.0, + "step": 15306 + }, + { + "epoch": 1.9472077343849383, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1626675128936768, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8727480173110962, + "num_tokens": 583888047.0, + "step": 15307 + }, + { + "epoch": 1.9473349446635289, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9956856966018677, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8636054992675781, + "num_tokens": 583923220.0, + "step": 15308 + }, + { + "epoch": 1.9474621549421194, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0763916969299316, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8595583438873291, + "num_tokens": 583965718.0, + "step": 15309 + }, + { + "epoch": 1.94758936522071, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.630367398262024, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8683022260665894, + "num_tokens": 584014027.0, + "step": 15310 + }, + { + "epoch": 1.9477165754993004, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2199792861938477, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8668999671936035, + "num_tokens": 584045019.0, + "step": 15311 + }, + { + "epoch": 1.947843785777891, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.911381721496582, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.847480297088623, + "num_tokens": 584086839.0, + "step": 15312 + }, + { + "epoch": 1.9479709960564815, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8968429565429688, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8787006139755249, + "num_tokens": 584123785.0, + "step": 15313 + }, + { + "epoch": 1.948098206335072, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8333852291107178, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8550362586975098, + "num_tokens": 584163768.0, + "step": 15314 + }, + { + "epoch": 1.9482254166136623, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9372891187667847, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8591567873954773, + "num_tokens": 584201668.0, + "step": 15315 + }, + { + "epoch": 1.9483526268922529, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9252947568893433, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8715236186981201, + "num_tokens": 584236626.0, + "step": 15316 + }, + { + "epoch": 1.9484798371708434, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8548147678375244, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8468800187110901, + "num_tokens": 584279811.0, + "step": 15317 + }, + { + "epoch": 1.948607047449434, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8903254270553589, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8637685775756836, + "num_tokens": 584317314.0, + "step": 15318 + }, + { + "epoch": 1.9487342577280244, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7756797075271606, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8693517446517944, + "num_tokens": 584355768.0, + "step": 15319 + }, + { + "epoch": 1.948861468006615, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0160787105560303, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8605297207832336, + "num_tokens": 584386205.0, + "step": 15320 + }, + { + "epoch": 1.9489886782852053, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.867072343826294, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8761624097824097, + "num_tokens": 584419721.0, + "step": 15321 + }, + { + "epoch": 1.9491158885637958, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1372265815734863, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8616679310798645, + "num_tokens": 584451029.0, + "step": 15322 + }, + { + "epoch": 1.9492430988423863, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9500080347061157, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8779516220092773, + "num_tokens": 584483991.0, + "step": 15323 + }, + { + "epoch": 1.9493703091209769, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0306732654571533, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8613077402114868, + "num_tokens": 584517652.0, + "step": 15324 + }, + { + "epoch": 1.9494975193995674, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.971663475036621, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.872288703918457, + "num_tokens": 584561058.0, + "step": 15325 + }, + { + "epoch": 1.949624729678158, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9505910873413086, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8670458197593689, + "num_tokens": 584595097.0, + "step": 15326 + }, + { + "epoch": 1.9497519399567484, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7439557313919067, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8714564442634583, + "num_tokens": 584639073.0, + "step": 15327 + }, + { + "epoch": 1.949879150235339, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7303348779678345, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8539457321166992, + "num_tokens": 584680517.0, + "step": 15328 + }, + { + "epoch": 1.9500063605139295, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2166969776153564, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8631867170333862, + "num_tokens": 584717796.0, + "step": 15329 + }, + { + "epoch": 1.95013357079252, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.825615406036377, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8764392137527466, + "num_tokens": 584755595.0, + "step": 15330 + }, + { + "epoch": 1.9502607810711106, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.024407148361206, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8530827760696411, + "num_tokens": 584798681.0, + "step": 15331 + }, + { + "epoch": 1.950387991349701, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.069225788116455, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8657076954841614, + "num_tokens": 584835424.0, + "step": 15332 + }, + { + "epoch": 1.9505152016282916, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8478343486785889, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8774768710136414, + "num_tokens": 584872063.0, + "step": 15333 + }, + { + "epoch": 1.9506424119068821, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.812467336654663, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8797548413276672, + "num_tokens": 584905850.0, + "step": 15334 + }, + { + "epoch": 1.9507696221854727, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8654693365097046, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8722543120384216, + "num_tokens": 584946480.0, + "step": 15335 + }, + { + "epoch": 1.9508968324640632, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0666990280151367, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8704766035079956, + "num_tokens": 584977382.0, + "step": 15336 + }, + { + "epoch": 1.9510240427426537, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.030174732208252, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8669843673706055, + "num_tokens": 585010885.0, + "step": 15337 + }, + { + "epoch": 1.9511512530212443, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9539170265197754, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8611929416656494, + "num_tokens": 585048208.0, + "step": 15338 + }, + { + "epoch": 1.9512784632998348, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8620414733886719, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8591955900192261, + "num_tokens": 585087477.0, + "step": 15339 + }, + { + "epoch": 1.951405673578425, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.978149175643921, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8629095554351807, + "num_tokens": 585120860.0, + "step": 15340 + }, + { + "epoch": 1.9515328838570156, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9109829664230347, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.865761399269104, + "num_tokens": 585161887.0, + "step": 15341 + }, + { + "epoch": 1.9516600941356061, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8377716541290283, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8726853132247925, + "num_tokens": 585200496.0, + "step": 15342 + }, + { + "epoch": 1.9517873044141967, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9156630039215088, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8648712635040283, + "num_tokens": 585243976.0, + "step": 15343 + }, + { + "epoch": 1.9519145146927872, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9700874090194702, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8592536449432373, + "num_tokens": 585282366.0, + "step": 15344 + }, + { + "epoch": 1.9520417249713777, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8851184844970703, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8640499711036682, + "num_tokens": 585325224.0, + "step": 15345 + }, + { + "epoch": 1.952168935249968, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9344093799591064, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8731228113174438, + "num_tokens": 585361954.0, + "step": 15346 + }, + { + "epoch": 1.9522961455285586, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9491233825683594, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.860511064529419, + "num_tokens": 585406033.0, + "step": 15347 + }, + { + "epoch": 1.952423355807149, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.074320077896118, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8641496896743774, + "num_tokens": 585440457.0, + "step": 15348 + }, + { + "epoch": 1.9525505660857396, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9538393020629883, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8868972063064575, + "num_tokens": 585473451.0, + "step": 15349 + }, + { + "epoch": 1.9526777763643302, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8599461317062378, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8684124946594238, + "num_tokens": 585509652.0, + "step": 15350 + }, + { + "epoch": 1.9528049866429207, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0345089435577393, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8689551949501038, + "num_tokens": 585545591.0, + "step": 15351 + }, + { + "epoch": 1.9529321969215112, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8743525743484497, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8526598811149597, + "num_tokens": 585583757.0, + "step": 15352 + }, + { + "epoch": 1.9530594072001017, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8184537887573242, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8535364866256714, + "num_tokens": 585623897.0, + "step": 15353 + }, + { + "epoch": 1.9531866174786923, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.846206784248352, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8676806092262268, + "num_tokens": 585661043.0, + "step": 15354 + }, + { + "epoch": 1.9533138277572828, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.011475086212158, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8381510972976685, + "num_tokens": 585699201.0, + "step": 15355 + }, + { + "epoch": 1.9534410380358733, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9139041900634766, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.857377290725708, + "num_tokens": 585735325.0, + "step": 15356 + }, + { + "epoch": 1.9535682483144639, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9669568538665771, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8725699186325073, + "num_tokens": 585769909.0, + "step": 15357 + }, + { + "epoch": 1.9536954585930544, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9517027139663696, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8533205389976501, + "num_tokens": 585804590.0, + "step": 15358 + }, + { + "epoch": 1.953822668871645, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3764076232910156, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8609057664871216, + "num_tokens": 585843349.0, + "step": 15359 + }, + { + "epoch": 1.9539498791502354, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.887231707572937, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8774070143699646, + "num_tokens": 585881846.0, + "step": 15360 + }, + { + "epoch": 1.954077089428826, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8999981880187988, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8614041805267334, + "num_tokens": 585917848.0, + "step": 15361 + }, + { + "epoch": 1.9542042997074165, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0879995822906494, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8694139122962952, + "num_tokens": 585961249.0, + "step": 15362 + }, + { + "epoch": 1.954331509986007, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7986958026885986, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8734843730926514, + "num_tokens": 586005795.0, + "step": 15363 + }, + { + "epoch": 1.9544587202645973, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.800879716873169, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8725448846817017, + "num_tokens": 586041523.0, + "step": 15364 + }, + { + "epoch": 1.9545859305431879, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0211338996887207, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8562085628509521, + "num_tokens": 586077800.0, + "step": 15365 + }, + { + "epoch": 1.9547131408217784, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9433650970458984, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8598794937133789, + "num_tokens": 586116010.0, + "step": 15366 + }, + { + "epoch": 1.954840351100369, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7948849201202393, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8768237829208374, + "num_tokens": 586155359.0, + "step": 15367 + }, + { + "epoch": 1.9549675613789594, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8415396213531494, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8676048517227173, + "num_tokens": 586188190.0, + "step": 15368 + }, + { + "epoch": 1.95509477165755, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9425461292266846, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8638991713523865, + "num_tokens": 586225971.0, + "step": 15369 + }, + { + "epoch": 1.9552219819361403, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9497787952423096, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8607565760612488, + "num_tokens": 586263295.0, + "step": 15370 + }, + { + "epoch": 1.9553491922147308, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1117630004882812, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8849201202392578, + "num_tokens": 586293547.0, + "step": 15371 + }, + { + "epoch": 1.9554764024933213, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1830239295959473, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8605304956436157, + "num_tokens": 586333180.0, + "step": 15372 + }, + { + "epoch": 1.9556036127719119, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8809099197387695, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8658610582351685, + "num_tokens": 586368780.0, + "step": 15373 + }, + { + "epoch": 1.9557308230505024, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9401118755340576, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8487757444381714, + "num_tokens": 586404177.0, + "step": 15374 + }, + { + "epoch": 1.955858033329093, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0393950939178467, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8497958779335022, + "num_tokens": 586446917.0, + "step": 15375 + }, + { + "epoch": 1.9559852436076834, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.414695978164673, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8647563457489014, + "num_tokens": 586485832.0, + "step": 15376 + }, + { + "epoch": 1.956112453886274, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9451584815979004, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.87509685754776, + "num_tokens": 586524089.0, + "step": 15377 + }, + { + "epoch": 1.9562396641648645, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8778239488601685, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8686030507087708, + "num_tokens": 586564097.0, + "step": 15378 + }, + { + "epoch": 1.956366874443455, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8719624280929565, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8768039345741272, + "num_tokens": 586600653.0, + "step": 15379 + }, + { + "epoch": 1.9564940847220456, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8865442276000977, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8704922795295715, + "num_tokens": 586639527.0, + "step": 15380 + }, + { + "epoch": 1.956621295000636, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2634055614471436, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8681753277778625, + "num_tokens": 586670885.0, + "step": 15381 + }, + { + "epoch": 1.9567485052792266, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0305898189544678, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8775320649147034, + "num_tokens": 586709875.0, + "step": 15382 + }, + { + "epoch": 1.9568757155578171, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9836785793304443, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8586470484733582, + "num_tokens": 586753164.0, + "step": 15383 + }, + { + "epoch": 1.9570029258364077, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8161402940750122, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.871905505657196, + "num_tokens": 586793737.0, + "step": 15384 + }, + { + "epoch": 1.9571301361149982, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8328527212142944, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8602907061576843, + "num_tokens": 586833428.0, + "step": 15385 + }, + { + "epoch": 1.9572573463935887, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.096249580383301, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8424273133277893, + "num_tokens": 586871202.0, + "step": 15386 + }, + { + "epoch": 1.9573845566721793, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8603122234344482, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8755431175231934, + "num_tokens": 586905537.0, + "step": 15387 + }, + { + "epoch": 1.9575117669507698, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.974016547203064, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.854012131690979, + "num_tokens": 586940378.0, + "step": 15388 + }, + { + "epoch": 1.95763897722936, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.155392646789551, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8751589059829712, + "num_tokens": 586978806.0, + "step": 15389 + }, + { + "epoch": 1.9577661875079506, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.314035415649414, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.860089898109436, + "num_tokens": 587007367.0, + "step": 15390 + }, + { + "epoch": 1.9578933977865411, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8295438289642334, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8868027925491333, + "num_tokens": 587040422.0, + "step": 15391 + }, + { + "epoch": 1.9580206080651317, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.175666093826294, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8831706047058105, + "num_tokens": 587070238.0, + "step": 15392 + }, + { + "epoch": 1.9581478183437222, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.811787486076355, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8719276189804077, + "num_tokens": 587105973.0, + "step": 15393 + }, + { + "epoch": 1.9582750286223127, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8273390531539917, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8778447508811951, + "num_tokens": 587137975.0, + "step": 15394 + }, + { + "epoch": 1.958402238900903, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.046135425567627, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8612679243087769, + "num_tokens": 587169176.0, + "step": 15395 + }, + { + "epoch": 1.9585294491794936, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.865402102470398, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.862516462802887, + "num_tokens": 587205570.0, + "step": 15396 + }, + { + "epoch": 1.958656659458084, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.934448480606079, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.857023298740387, + "num_tokens": 587246729.0, + "step": 15397 + }, + { + "epoch": 1.9587838697366746, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8381884098052979, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8683961629867554, + "num_tokens": 587284537.0, + "step": 15398 + }, + { + "epoch": 1.9589110800152651, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.075258255004883, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8694248795509338, + "num_tokens": 587324394.0, + "step": 15399 + }, + { + "epoch": 1.9590382902938557, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.96186363697052, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8729183077812195, + "num_tokens": 587359935.0, + "step": 15400 + }, + { + "epoch": 1.9591655005724462, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8403551578521729, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.874975323677063, + "num_tokens": 587402081.0, + "step": 15401 + }, + { + "epoch": 1.9592927108510367, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8645257949829102, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8624640107154846, + "num_tokens": 587442939.0, + "step": 15402 + }, + { + "epoch": 1.9594199211296273, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9269651174545288, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.856807291507721, + "num_tokens": 587479155.0, + "step": 15403 + }, + { + "epoch": 1.9595471314082178, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8655695915222168, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8534722328186035, + "num_tokens": 587519285.0, + "step": 15404 + }, + { + "epoch": 1.9596743416868083, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.890474796295166, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8467303514480591, + "num_tokens": 587563978.0, + "step": 15405 + }, + { + "epoch": 1.9598015519653988, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2063348293304443, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8595532774925232, + "num_tokens": 587599052.0, + "step": 15406 + }, + { + "epoch": 1.9599287622439894, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8163362741470337, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8683007955551147, + "num_tokens": 587639980.0, + "step": 15407 + }, + { + "epoch": 1.96005597252258, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9648993015289307, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.859769344329834, + "num_tokens": 587673836.0, + "step": 15408 + }, + { + "epoch": 1.9601831828011704, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0218777656555176, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.86436927318573, + "num_tokens": 587712896.0, + "step": 15409 + }, + { + "epoch": 1.960310393079761, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.00154185295105, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8575232625007629, + "num_tokens": 587747352.0, + "step": 15410 + }, + { + "epoch": 1.9604376033583515, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8168535232543945, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8638613224029541, + "num_tokens": 587790234.0, + "step": 15411 + }, + { + "epoch": 1.960564813636942, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.847890019416809, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8786035776138306, + "num_tokens": 587830185.0, + "step": 15412 + }, + { + "epoch": 1.9606920239155323, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0261223316192627, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8655380010604858, + "num_tokens": 587866265.0, + "step": 15413 + }, + { + "epoch": 1.9608192341941229, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8479117155075073, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8764870762825012, + "num_tokens": 587904956.0, + "step": 15414 + }, + { + "epoch": 1.9609464444727134, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8498082160949707, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8715640306472778, + "num_tokens": 587942369.0, + "step": 15415 + }, + { + "epoch": 1.961073654751304, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8430672883987427, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.869143545627594, + "num_tokens": 587979340.0, + "step": 15416 + }, + { + "epoch": 1.9612008650298944, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9470844268798828, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8599859476089478, + "num_tokens": 588021812.0, + "step": 15417 + }, + { + "epoch": 1.961328075308485, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8497254848480225, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8606814742088318, + "num_tokens": 588064978.0, + "step": 15418 + }, + { + "epoch": 1.9614552855870753, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9167563915252686, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8613954782485962, + "num_tokens": 588098142.0, + "step": 15419 + }, + { + "epoch": 1.9615824958656658, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6704787015914917, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8713473081588745, + "num_tokens": 588141808.0, + "step": 15420 + }, + { + "epoch": 1.9617097061442563, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.758854627609253, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8705794811248779, + "num_tokens": 588178137.0, + "step": 15421 + }, + { + "epoch": 1.9618369164228469, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0146238803863525, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8583378195762634, + "num_tokens": 588215646.0, + "step": 15422 + }, + { + "epoch": 1.9619641267014374, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.931462287902832, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8560447096824646, + "num_tokens": 588251345.0, + "step": 15423 + }, + { + "epoch": 1.962091336980028, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8334450721740723, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8705356121063232, + "num_tokens": 588291222.0, + "step": 15424 + }, + { + "epoch": 1.9622185472586184, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8486407995224, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8558031320571899, + "num_tokens": 588329480.0, + "step": 15425 + }, + { + "epoch": 1.962345757537209, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9807593822479248, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8565124273300171, + "num_tokens": 588367143.0, + "step": 15426 + }, + { + "epoch": 1.9624729678157995, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7843666076660156, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8742270469665527, + "num_tokens": 588408363.0, + "step": 15427 + }, + { + "epoch": 1.96260017809439, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9093852043151855, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8706018924713135, + "num_tokens": 588445397.0, + "step": 15428 + }, + { + "epoch": 1.9627273883729806, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1692347526550293, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8701701760292053, + "num_tokens": 588480436.0, + "step": 15429 + }, + { + "epoch": 1.962854598651571, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8337618112564087, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8640654683113098, + "num_tokens": 588526918.0, + "step": 15430 + }, + { + "epoch": 1.9629818089301616, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.028303861618042, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8659175634384155, + "num_tokens": 588561300.0, + "step": 15431 + }, + { + "epoch": 1.9631090192087521, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9205433130264282, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8556567430496216, + "num_tokens": 588603393.0, + "step": 15432 + }, + { + "epoch": 1.9632362294873427, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.910515546798706, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8725281357765198, + "num_tokens": 588643806.0, + "step": 15433 + }, + { + "epoch": 1.9633634397659332, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8306174278259277, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8630288243293762, + "num_tokens": 588685090.0, + "step": 15434 + }, + { + "epoch": 1.9634906500445237, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9801998138427734, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8527050018310547, + "num_tokens": 588720983.0, + "step": 15435 + }, + { + "epoch": 1.9636178603231143, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7818500995635986, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8588556051254272, + "num_tokens": 588760143.0, + "step": 15436 + }, + { + "epoch": 1.9637450706017048, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.889979362487793, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8655386567115784, + "num_tokens": 588796636.0, + "step": 15437 + }, + { + "epoch": 1.963872280880295, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.91798734664917, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8608442544937134, + "num_tokens": 588831681.0, + "step": 15438 + }, + { + "epoch": 1.9639994911588856, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.922409176826477, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8754342794418335, + "num_tokens": 588875408.0, + "step": 15439 + }, + { + "epoch": 1.9641267014374761, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8484212160110474, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8685813546180725, + "num_tokens": 588913583.0, + "step": 15440 + }, + { + "epoch": 1.9642539117160667, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8068530559539795, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8739223480224609, + "num_tokens": 588949833.0, + "step": 15441 + }, + { + "epoch": 1.9643811219946572, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9191317558288574, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8676688075065613, + "num_tokens": 588985678.0, + "step": 15442 + }, + { + "epoch": 1.9645083322732477, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.044459819793701, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8631581664085388, + "num_tokens": 589018276.0, + "step": 15443 + }, + { + "epoch": 1.964635542551838, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8520244359970093, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8627272248268127, + "num_tokens": 589061900.0, + "step": 15444 + }, + { + "epoch": 1.9647627528304286, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 3.0978479385375977, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.866497278213501, + "num_tokens": 589102740.0, + "step": 15445 + }, + { + "epoch": 1.964889963109019, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.861959457397461, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.874331533908844, + "num_tokens": 589135276.0, + "step": 15446 + }, + { + "epoch": 1.9650171733876096, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9560296535491943, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.864625871181488, + "num_tokens": 589173390.0, + "step": 15447 + }, + { + "epoch": 1.9651443836662001, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8507786989212036, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8581409454345703, + "num_tokens": 589214592.0, + "step": 15448 + }, + { + "epoch": 1.9652715939447907, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.052619457244873, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8653020858764648, + "num_tokens": 589250682.0, + "step": 15449 + }, + { + "epoch": 1.9653988042233812, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7877936363220215, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8595575094223022, + "num_tokens": 589288326.0, + "step": 15450 + }, + { + "epoch": 1.9655260145019717, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.911519169807434, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8614749908447266, + "num_tokens": 589321632.0, + "step": 15451 + }, + { + "epoch": 1.9656532247805623, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.947911024093628, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8547764420509338, + "num_tokens": 589361134.0, + "step": 15452 + }, + { + "epoch": 1.9657804350591528, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.776772975921631, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8677490949630737, + "num_tokens": 589402338.0, + "step": 15453 + }, + { + "epoch": 1.9659076453377433, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9822622537612915, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8584342002868652, + "num_tokens": 589441080.0, + "step": 15454 + }, + { + "epoch": 1.9660348556163338, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9578368663787842, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8618782162666321, + "num_tokens": 589484019.0, + "step": 15455 + }, + { + "epoch": 1.9661620658949244, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.922038197517395, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8592509031295776, + "num_tokens": 589525890.0, + "step": 15456 + }, + { + "epoch": 1.966289276173515, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8277442455291748, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8700377941131592, + "num_tokens": 589560644.0, + "step": 15457 + }, + { + "epoch": 1.9664164864521054, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9576430320739746, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8848507404327393, + "num_tokens": 589600863.0, + "step": 15458 + }, + { + "epoch": 1.966543696730696, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.024153709411621, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8630653619766235, + "num_tokens": 589637948.0, + "step": 15459 + }, + { + "epoch": 1.9666709070092865, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8992621898651123, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8782131671905518, + "num_tokens": 589678061.0, + "step": 15460 + }, + { + "epoch": 1.966798117287877, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8744540214538574, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8541146516799927, + "num_tokens": 589718050.0, + "step": 15461 + }, + { + "epoch": 1.9669253275664673, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.851088047027588, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8672407269477844, + "num_tokens": 589759206.0, + "step": 15462 + }, + { + "epoch": 1.9670525378450578, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9006012678146362, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8529136180877686, + "num_tokens": 589800218.0, + "step": 15463 + }, + { + "epoch": 1.9671797481236484, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7401292324066162, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8649954795837402, + "num_tokens": 589843105.0, + "step": 15464 + }, + { + "epoch": 1.967306958402239, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9413584470748901, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8560370206832886, + "num_tokens": 589881560.0, + "step": 15465 + }, + { + "epoch": 1.9674341686808294, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0173614025115967, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8542625308036804, + "num_tokens": 589922755.0, + "step": 15466 + }, + { + "epoch": 1.96756137895942, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1039350032806396, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8464379906654358, + "num_tokens": 589964982.0, + "step": 15467 + }, + { + "epoch": 1.9676885892380103, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8755472898483276, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8511873483657837, + "num_tokens": 590003371.0, + "step": 15468 + }, + { + "epoch": 1.9678157995166008, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.785796046257019, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8616312146186829, + "num_tokens": 590048361.0, + "step": 15469 + }, + { + "epoch": 1.9679430097951913, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9723999500274658, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8615310192108154, + "num_tokens": 590083141.0, + "step": 15470 + }, + { + "epoch": 1.9680702200737819, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8022854328155518, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8591169118881226, + "num_tokens": 590128161.0, + "step": 15471 + }, + { + "epoch": 1.9681974303523724, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7729425430297852, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8808145523071289, + "num_tokens": 590166101.0, + "step": 15472 + }, + { + "epoch": 1.968324640630963, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.782044768333435, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8628833293914795, + "num_tokens": 590207643.0, + "step": 15473 + }, + { + "epoch": 1.9684518509095534, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0160982608795166, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8575206398963928, + "num_tokens": 590247567.0, + "step": 15474 + }, + { + "epoch": 1.968579061188144, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8169887065887451, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8603432178497314, + "num_tokens": 590287940.0, + "step": 15475 + }, + { + "epoch": 1.9687062714667345, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9185824394226074, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8570542335510254, + "num_tokens": 590322558.0, + "step": 15476 + }, + { + "epoch": 1.968833481745325, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.004218339920044, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.862398087978363, + "num_tokens": 590362913.0, + "step": 15477 + }, + { + "epoch": 1.9689606920239155, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9271858930587769, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8552196025848389, + "num_tokens": 590405877.0, + "step": 15478 + }, + { + "epoch": 1.969087902302506, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.887937068939209, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8542395830154419, + "num_tokens": 590445373.0, + "step": 15479 + }, + { + "epoch": 1.9692151125810966, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8975036144256592, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8534418940544128, + "num_tokens": 590483319.0, + "step": 15480 + }, + { + "epoch": 1.9693423228596871, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7936267852783203, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8500736951828003, + "num_tokens": 590522433.0, + "step": 15481 + }, + { + "epoch": 1.9694695331382777, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 3.9565610885620117, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8688506484031677, + "num_tokens": 590564342.0, + "step": 15482 + }, + { + "epoch": 1.9695967434168682, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1626408100128174, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8658133745193481, + "num_tokens": 590594527.0, + "step": 15483 + }, + { + "epoch": 1.9697239536954587, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.085336208343506, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8750706315040588, + "num_tokens": 590630573.0, + "step": 15484 + }, + { + "epoch": 1.9698511639740492, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.830337405204773, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8653181195259094, + "num_tokens": 590669333.0, + "step": 15485 + }, + { + "epoch": 1.9699783742526398, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.78836989402771, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8797354102134705, + "num_tokens": 590709884.0, + "step": 15486 + }, + { + "epoch": 1.97010558453123, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8852120637893677, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8701653480529785, + "num_tokens": 590746109.0, + "step": 15487 + }, + { + "epoch": 1.9702327948098206, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7517485618591309, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8488517999649048, + "num_tokens": 590787881.0, + "step": 15488 + }, + { + "epoch": 1.9703600050884111, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0534446239471436, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8497587442398071, + "num_tokens": 590826861.0, + "step": 15489 + }, + { + "epoch": 1.9704872153670017, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.011472463607788, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8727964162826538, + "num_tokens": 590863104.0, + "step": 15490 + }, + { + "epoch": 1.9706144256455922, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9291338920593262, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8707125782966614, + "num_tokens": 590900582.0, + "step": 15491 + }, + { + "epoch": 1.9707416359241827, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.877976417541504, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8717198371887207, + "num_tokens": 590937735.0, + "step": 15492 + }, + { + "epoch": 1.970868846202773, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.807440161705017, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8784380555152893, + "num_tokens": 590973089.0, + "step": 15493 + }, + { + "epoch": 1.9709960564813636, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.910520076751709, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8492187261581421, + "num_tokens": 591012117.0, + "step": 15494 + }, + { + "epoch": 1.971123266759954, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.075317621231079, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8450964689254761, + "num_tokens": 591044978.0, + "step": 15495 + }, + { + "epoch": 1.9712504770385446, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8631459474563599, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8598529100418091, + "num_tokens": 591085562.0, + "step": 15496 + }, + { + "epoch": 1.9713776873171351, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9697163105010986, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8547557592391968, + "num_tokens": 591120140.0, + "step": 15497 + }, + { + "epoch": 1.9715048975957257, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.007699966430664, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.853263795375824, + "num_tokens": 591153952.0, + "step": 15498 + }, + { + "epoch": 1.9716321078743162, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7779107093811035, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8822249174118042, + "num_tokens": 591192983.0, + "step": 15499 + }, + { + "epoch": 1.9717593181529067, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1639368534088135, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8654919862747192, + "num_tokens": 591226260.0, + "step": 15500 + }, + { + "epoch": 1.9718865284314973, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0873453617095947, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8500586748123169, + "num_tokens": 591259361.0, + "step": 15501 + }, + { + "epoch": 1.9720137387100878, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8698148727416992, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8635594248771667, + "num_tokens": 591299753.0, + "step": 15502 + }, + { + "epoch": 1.9721409489886783, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.905869483947754, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8663272261619568, + "num_tokens": 591335008.0, + "step": 15503 + }, + { + "epoch": 1.9722681592672688, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.932529091835022, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8706444501876831, + "num_tokens": 591373001.0, + "step": 15504 + }, + { + "epoch": 1.9723953695458594, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.851792812347412, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8674423098564148, + "num_tokens": 591408637.0, + "step": 15505 + }, + { + "epoch": 1.97252257982445, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9106992483139038, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8606452941894531, + "num_tokens": 591444908.0, + "step": 15506 + }, + { + "epoch": 1.9726497901030404, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.900019884109497, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8659564256668091, + "num_tokens": 591484641.0, + "step": 15507 + }, + { + "epoch": 1.972777000381631, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9633764028549194, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8611714839935303, + "num_tokens": 591521859.0, + "step": 15508 + }, + { + "epoch": 1.9729042106602215, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.8443217277526855, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8635208606719971, + "num_tokens": 591559124.0, + "step": 15509 + }, + { + "epoch": 1.973031420938812, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8559678792953491, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8629746437072754, + "num_tokens": 591597170.0, + "step": 15510 + }, + { + "epoch": 1.9731586312174023, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.955156922340393, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8686951398849487, + "num_tokens": 591634784.0, + "step": 15511 + }, + { + "epoch": 1.9732858414959928, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.827516794204712, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8687214255332947, + "num_tokens": 591671904.0, + "step": 15512 + }, + { + "epoch": 1.9734130517745834, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8535431623458862, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8668593764305115, + "num_tokens": 591712990.0, + "step": 15513 + }, + { + "epoch": 1.973540262053174, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9753249883651733, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8613969087600708, + "num_tokens": 591748579.0, + "step": 15514 + }, + { + "epoch": 1.9736674723317644, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9890637397766113, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.85722815990448, + "num_tokens": 591785865.0, + "step": 15515 + }, + { + "epoch": 1.973794682610355, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7954883575439453, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8724768757820129, + "num_tokens": 591820087.0, + "step": 15516 + }, + { + "epoch": 1.9739218928889453, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8496696949005127, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.863724946975708, + "num_tokens": 591859713.0, + "step": 15517 + }, + { + "epoch": 1.9740491031675358, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.94770085811615, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8717670440673828, + "num_tokens": 591893142.0, + "step": 15518 + }, + { + "epoch": 1.9741763134461263, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8827612400054932, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8590281009674072, + "num_tokens": 591931529.0, + "step": 15519 + }, + { + "epoch": 1.9743035237247168, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.015306234359741, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8676680326461792, + "num_tokens": 591976248.0, + "step": 15520 + }, + { + "epoch": 1.9744307340033074, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.881358027458191, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8614040613174438, + "num_tokens": 592018008.0, + "step": 15521 + }, + { + "epoch": 1.974557944281898, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8674778938293457, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8496588468551636, + "num_tokens": 592062934.0, + "step": 15522 + }, + { + "epoch": 1.9746851545604884, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8390628099441528, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8615121245384216, + "num_tokens": 592103129.0, + "step": 15523 + }, + { + "epoch": 1.974812364839079, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1242659091949463, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8605615496635437, + "num_tokens": 592142116.0, + "step": 15524 + }, + { + "epoch": 1.9749395751176695, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7566075325012207, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8653663992881775, + "num_tokens": 592182507.0, + "step": 15525 + }, + { + "epoch": 1.97506678539626, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8912640810012817, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8424396514892578, + "num_tokens": 592225593.0, + "step": 15526 + }, + { + "epoch": 1.9751939956748505, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7170133590698242, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8887320756912231, + "num_tokens": 592270676.0, + "step": 15527 + }, + { + "epoch": 1.975321205953441, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9817860126495361, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8834460973739624, + "num_tokens": 592305537.0, + "step": 15528 + }, + { + "epoch": 1.9754484162320316, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9046865701675415, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8706246614456177, + "num_tokens": 592338289.0, + "step": 15529 + }, + { + "epoch": 1.9755756265106221, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.766181468963623, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8671786785125732, + "num_tokens": 592379189.0, + "step": 15530 + }, + { + "epoch": 1.9757028367892127, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8958544731140137, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8779718279838562, + "num_tokens": 592421875.0, + "step": 15531 + }, + { + "epoch": 1.9758300470678032, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9674625396728516, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8453563451766968, + "num_tokens": 592457438.0, + "step": 15532 + }, + { + "epoch": 1.9759572573463937, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9033677577972412, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8746839761734009, + "num_tokens": 592493230.0, + "step": 15533 + }, + { + "epoch": 1.9760844676249842, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0720834732055664, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8482418060302734, + "num_tokens": 592533656.0, + "step": 15534 + }, + { + "epoch": 1.9762116779035748, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8423460721969604, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8769660592079163, + "num_tokens": 592571622.0, + "step": 15535 + }, + { + "epoch": 1.976338888182165, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.96730637550354, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8583112955093384, + "num_tokens": 592613260.0, + "step": 15536 + }, + { + "epoch": 1.9764660984607556, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.90469491481781, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8620204925537109, + "num_tokens": 592648872.0, + "step": 15537 + }, + { + "epoch": 1.9765933087393461, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.268073797225952, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8655855655670166, + "num_tokens": 592682336.0, + "step": 15538 + }, + { + "epoch": 1.9767205190179367, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9175679683685303, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.877738893032074, + "num_tokens": 592716412.0, + "step": 15539 + }, + { + "epoch": 1.9768477292965272, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0184478759765625, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8657312393188477, + "num_tokens": 592753936.0, + "step": 15540 + }, + { + "epoch": 1.9769749395751177, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1132924556732178, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8647247552871704, + "num_tokens": 592792731.0, + "step": 15541 + }, + { + "epoch": 1.977102149853708, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.860661506652832, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8675967454910278, + "num_tokens": 592832035.0, + "step": 15542 + }, + { + "epoch": 1.9772293601322986, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9329816102981567, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8816124796867371, + "num_tokens": 592869693.0, + "step": 15543 + }, + { + "epoch": 1.977356570410889, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7154535055160522, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8678007125854492, + "num_tokens": 592909648.0, + "step": 15544 + }, + { + "epoch": 1.9774837806894796, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0575919151306152, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8663080930709839, + "num_tokens": 592942539.0, + "step": 15545 + }, + { + "epoch": 1.9776109909680701, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.812048077583313, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8632051348686218, + "num_tokens": 592983680.0, + "step": 15546 + }, + { + "epoch": 1.9777382012466607, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8717255592346191, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8544521331787109, + "num_tokens": 593023161.0, + "step": 15547 + }, + { + "epoch": 1.9778654115252512, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8048105239868164, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.846161961555481, + "num_tokens": 593066612.0, + "step": 15548 + }, + { + "epoch": 1.9779926218038417, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9696495532989502, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8508654832839966, + "num_tokens": 593105224.0, + "step": 15549 + }, + { + "epoch": 1.9781198320824323, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8599534034729004, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.863704264163971, + "num_tokens": 593143683.0, + "step": 15550 + }, + { + "epoch": 1.9782470423610228, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.947495698928833, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8497527241706848, + "num_tokens": 593179827.0, + "step": 15551 + }, + { + "epoch": 1.9783742526396133, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7995753288269043, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8647211790084839, + "num_tokens": 593220936.0, + "step": 15552 + }, + { + "epoch": 1.9785014629182038, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7953596115112305, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8593407273292542, + "num_tokens": 593258503.0, + "step": 15553 + }, + { + "epoch": 1.9786286731967944, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1162045001983643, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.844588041305542, + "num_tokens": 593293738.0, + "step": 15554 + }, + { + "epoch": 1.978755883475385, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8149272203445435, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8552920818328857, + "num_tokens": 593338037.0, + "step": 15555 + }, + { + "epoch": 1.9788830937539754, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8877614736557007, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.849417507648468, + "num_tokens": 593383509.0, + "step": 15556 + }, + { + "epoch": 1.979010304032566, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.89161217212677, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8674538135528564, + "num_tokens": 593417734.0, + "step": 15557 + }, + { + "epoch": 1.9791375143111565, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.773701548576355, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8754799365997314, + "num_tokens": 593454502.0, + "step": 15558 + }, + { + "epoch": 1.979264724589747, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9173105955123901, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8596292734146118, + "num_tokens": 593498864.0, + "step": 15559 + }, + { + "epoch": 1.9793919348683373, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9188412427902222, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8615875244140625, + "num_tokens": 593540125.0, + "step": 15560 + }, + { + "epoch": 1.9795191451469278, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1295430660247803, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8713480830192566, + "num_tokens": 593566937.0, + "step": 15561 + }, + { + "epoch": 1.9796463554255184, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9665964841842651, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8592363595962524, + "num_tokens": 593600109.0, + "step": 15562 + }, + { + "epoch": 1.979773565704109, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8203436136245728, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8629498481750488, + "num_tokens": 593638982.0, + "step": 15563 + }, + { + "epoch": 1.9799007759826994, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9794137477874756, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8542698621749878, + "num_tokens": 593677270.0, + "step": 15564 + }, + { + "epoch": 1.98002798626129, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.78023362159729, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.871640682220459, + "num_tokens": 593714056.0, + "step": 15565 + }, + { + "epoch": 1.9801551965398803, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8550453186035156, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8518551588058472, + "num_tokens": 593753516.0, + "step": 15566 + }, + { + "epoch": 1.9802824068184708, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9406620264053345, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8657601475715637, + "num_tokens": 593789276.0, + "step": 15567 + }, + { + "epoch": 1.9804096170970613, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.893254041671753, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8673385977745056, + "num_tokens": 593828112.0, + "step": 15568 + }, + { + "epoch": 1.9805368273756518, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8559465408325195, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8523526787757874, + "num_tokens": 593866370.0, + "step": 15569 + }, + { + "epoch": 1.9806640376542424, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.023937940597534, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8693351745605469, + "num_tokens": 593907863.0, + "step": 15570 + }, + { + "epoch": 1.980791247932833, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8206709623336792, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8751232624053955, + "num_tokens": 593945254.0, + "step": 15571 + }, + { + "epoch": 1.9809184582114234, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9144331216812134, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8691537380218506, + "num_tokens": 593987105.0, + "step": 15572 + }, + { + "epoch": 1.981045668490014, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7325758934020996, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8757612705230713, + "num_tokens": 594024091.0, + "step": 15573 + }, + { + "epoch": 1.9811728787686045, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9372437000274658, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8751107454299927, + "num_tokens": 594056018.0, + "step": 15574 + }, + { + "epoch": 1.981300089047195, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7118943929672241, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8675402402877808, + "num_tokens": 594102297.0, + "step": 15575 + }, + { + "epoch": 1.9814272993257855, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.882390022277832, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8655714988708496, + "num_tokens": 594140278.0, + "step": 15576 + }, + { + "epoch": 1.981554509604376, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9481271505355835, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8696498870849609, + "num_tokens": 594175025.0, + "step": 15577 + }, + { + "epoch": 1.9816817198829666, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7435898780822754, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8643008470535278, + "num_tokens": 594217906.0, + "step": 15578 + }, + { + "epoch": 1.9818089301615571, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.4067368507385254, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8471389412879944, + "num_tokens": 594264331.0, + "step": 15579 + }, + { + "epoch": 1.9819361404401477, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.936466097831726, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8799417018890381, + "num_tokens": 594300220.0, + "step": 15580 + }, + { + "epoch": 1.9820633507187382, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8476295471191406, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8694027662277222, + "num_tokens": 594337849.0, + "step": 15581 + }, + { + "epoch": 1.9821905609973287, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8923327922821045, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8570719957351685, + "num_tokens": 594379295.0, + "step": 15582 + }, + { + "epoch": 1.9823177712759192, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.10528564453125, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8601022958755493, + "num_tokens": 594408350.0, + "step": 15583 + }, + { + "epoch": 1.9824449815545098, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.829014539718628, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8543331623077393, + "num_tokens": 594455110.0, + "step": 15584 + }, + { + "epoch": 1.9825721918331, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8154044151306152, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8675947189331055, + "num_tokens": 594499231.0, + "step": 15585 + }, + { + "epoch": 1.9826994021116906, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.838475227355957, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8450929522514343, + "num_tokens": 594541478.0, + "step": 15586 + }, + { + "epoch": 1.9828266123902811, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.967726469039917, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8668404817581177, + "num_tokens": 594579075.0, + "step": 15587 + }, + { + "epoch": 1.9829538226688717, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0008704662323, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8803682923316956, + "num_tokens": 594616189.0, + "step": 15588 + }, + { + "epoch": 1.9830810329474622, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9708333015441895, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.871220052242279, + "num_tokens": 594656633.0, + "step": 15589 + }, + { + "epoch": 1.9832082432260527, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8881067037582397, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8466930389404297, + "num_tokens": 594696833.0, + "step": 15590 + }, + { + "epoch": 1.983335453504643, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9650768041610718, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8520796298980713, + "num_tokens": 594735138.0, + "step": 15591 + }, + { + "epoch": 1.9834626637832335, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9970545768737793, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8588918447494507, + "num_tokens": 594769587.0, + "step": 15592 + }, + { + "epoch": 1.983589874061824, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8342503309249878, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8703446388244629, + "num_tokens": 594803635.0, + "step": 15593 + }, + { + "epoch": 1.9837170843404146, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.105455160140991, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8794028759002686, + "num_tokens": 594839872.0, + "step": 15594 + }, + { + "epoch": 1.9838442946190051, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9228808879852295, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8514294028282166, + "num_tokens": 594879208.0, + "step": 15595 + }, + { + "epoch": 1.9839715048975957, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9728552103042603, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8707730770111084, + "num_tokens": 594910377.0, + "step": 15596 + }, + { + "epoch": 1.9840987151761862, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8104678392410278, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8670321702957153, + "num_tokens": 594956574.0, + "step": 15597 + }, + { + "epoch": 1.9842259254547767, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7501150369644165, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8497551083564758, + "num_tokens": 594998036.0, + "step": 15598 + }, + { + "epoch": 1.9843531357333672, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8426538705825806, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8587806224822998, + "num_tokens": 595031404.0, + "step": 15599 + }, + { + "epoch": 1.9844803460119578, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8070307970046997, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8730480074882507, + "num_tokens": 595069800.0, + "step": 15600 + }, + { + "epoch": 1.9846075562905483, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0351877212524414, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8705143928527832, + "num_tokens": 595110987.0, + "step": 15601 + }, + { + "epoch": 1.9847347665691388, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8334728479385376, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8689671754837036, + "num_tokens": 595149192.0, + "step": 15602 + }, + { + "epoch": 1.9848619768477294, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.100207805633545, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8618462085723877, + "num_tokens": 595182993.0, + "step": 15603 + }, + { + "epoch": 1.9849891871263199, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9981780052185059, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8624635934829712, + "num_tokens": 595216177.0, + "step": 15604 + }, + { + "epoch": 1.9851163974049104, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.93747878074646, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8656992316246033, + "num_tokens": 595253379.0, + "step": 15605 + }, + { + "epoch": 1.985243607683501, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 3.8787145614624023, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8682301640510559, + "num_tokens": 595292878.0, + "step": 15606 + }, + { + "epoch": 1.9853708179620915, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9718880653381348, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8649712800979614, + "num_tokens": 595329082.0, + "step": 15607 + }, + { + "epoch": 1.985498028240682, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.086784601211548, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8559131622314453, + "num_tokens": 595367246.0, + "step": 15608 + }, + { + "epoch": 1.9856252385192723, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8257429599761963, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8713163137435913, + "num_tokens": 595404450.0, + "step": 15609 + }, + { + "epoch": 1.9857524487978628, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9128167629241943, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8772669434547424, + "num_tokens": 595439718.0, + "step": 15610 + }, + { + "epoch": 1.9858796590764534, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9509378671646118, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8671848773956299, + "num_tokens": 595473617.0, + "step": 15611 + }, + { + "epoch": 1.986006869355044, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7561759948730469, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8617349863052368, + "num_tokens": 595516432.0, + "step": 15612 + }, + { + "epoch": 1.9861340796336344, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.839130163192749, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8659159541130066, + "num_tokens": 595556278.0, + "step": 15613 + }, + { + "epoch": 1.986261289912225, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7545816898345947, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8704673647880554, + "num_tokens": 595598713.0, + "step": 15614 + }, + { + "epoch": 1.9863885001908153, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0421864986419678, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8665010929107666, + "num_tokens": 595630597.0, + "step": 15615 + }, + { + "epoch": 1.9865157104694058, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8521900177001953, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8833206295967102, + "num_tokens": 595664163.0, + "step": 15616 + }, + { + "epoch": 1.9866429207479963, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9262679815292358, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8740619421005249, + "num_tokens": 595698809.0, + "step": 15617 + }, + { + "epoch": 1.9867701310265868, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8883475065231323, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8592565655708313, + "num_tokens": 595741166.0, + "step": 15618 + }, + { + "epoch": 1.9868973413051774, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9369229078292847, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8594250679016113, + "num_tokens": 595787079.0, + "step": 15619 + }, + { + "epoch": 1.987024551583768, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8927838802337646, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8405566215515137, + "num_tokens": 595829085.0, + "step": 15620 + }, + { + "epoch": 1.9871517618623584, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9374120235443115, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8630326390266418, + "num_tokens": 595870646.0, + "step": 15621 + }, + { + "epoch": 1.987278972140949, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7564600706100464, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8540604114532471, + "num_tokens": 595918473.0, + "step": 15622 + }, + { + "epoch": 1.9874061824195395, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9662748575210571, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8532933592796326, + "num_tokens": 595954097.0, + "step": 15623 + }, + { + "epoch": 1.98753339269813, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8284807205200195, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8457859754562378, + "num_tokens": 595990191.0, + "step": 15624 + }, + { + "epoch": 1.9876606029767205, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8759188652038574, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8565517067909241, + "num_tokens": 596027109.0, + "step": 15625 + }, + { + "epoch": 1.987787813255311, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7703819274902344, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8724148273468018, + "num_tokens": 596067930.0, + "step": 15626 + }, + { + "epoch": 1.9879150235339016, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3531525135040283, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8754849433898926, + "num_tokens": 596105809.0, + "step": 15627 + }, + { + "epoch": 1.9880422338124921, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0019278526306152, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8680264949798584, + "num_tokens": 596144220.0, + "step": 15628 + }, + { + "epoch": 1.9881694440910826, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0925302505493164, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8412984609603882, + "num_tokens": 596183825.0, + "step": 15629 + }, + { + "epoch": 1.9882966543696732, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6285059452056885, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8736169338226318, + "num_tokens": 596232333.0, + "step": 15630 + }, + { + "epoch": 1.9884238646482637, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8867162466049194, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8735993504524231, + "num_tokens": 596272138.0, + "step": 15631 + }, + { + "epoch": 1.9885510749268542, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9265824556350708, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8745414018630981, + "num_tokens": 596310254.0, + "step": 15632 + }, + { + "epoch": 1.9886782852054448, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8274340629577637, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8618886470794678, + "num_tokens": 596350396.0, + "step": 15633 + }, + { + "epoch": 1.988805495484035, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7800897359848022, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.867611825466156, + "num_tokens": 596388809.0, + "step": 15634 + }, + { + "epoch": 1.9889327057626256, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7932417392730713, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8716579079627991, + "num_tokens": 596427366.0, + "step": 15635 + }, + { + "epoch": 1.9890599160412161, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7752622365951538, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8561908006668091, + "num_tokens": 596469999.0, + "step": 15636 + }, + { + "epoch": 1.9891871263198067, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9482969045639038, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8491078019142151, + "num_tokens": 596506773.0, + "step": 15637 + }, + { + "epoch": 1.9893143365983972, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2715415954589844, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8672454357147217, + "num_tokens": 596547440.0, + "step": 15638 + }, + { + "epoch": 1.9894415468769877, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9721033573150635, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8722772002220154, + "num_tokens": 596580212.0, + "step": 15639 + }, + { + "epoch": 1.989568757155578, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9280757904052734, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.88642418384552, + "num_tokens": 596611790.0, + "step": 15640 + }, + { + "epoch": 1.9896959674341685, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8397234678268433, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8816149234771729, + "num_tokens": 596646984.0, + "step": 15641 + }, + { + "epoch": 1.989823177712759, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.1798298358917236, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8606613278388977, + "num_tokens": 596688362.0, + "step": 15642 + }, + { + "epoch": 1.9899503879913496, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9843921661376953, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8673995137214661, + "num_tokens": 596725298.0, + "step": 15643 + }, + { + "epoch": 1.9900775982699401, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.005650043487549, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8703210353851318, + "num_tokens": 596757400.0, + "step": 15644 + }, + { + "epoch": 1.9902048085485307, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.951493263244629, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8768690824508667, + "num_tokens": 596790533.0, + "step": 15645 + }, + { + "epoch": 1.9903320188271212, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0981080532073975, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8593990206718445, + "num_tokens": 596834467.0, + "step": 15646 + }, + { + "epoch": 1.9904592291057117, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.840772032737732, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8525570631027222, + "num_tokens": 596873262.0, + "step": 15647 + }, + { + "epoch": 1.9905864393843022, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8897480964660645, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8656196594238281, + "num_tokens": 596912833.0, + "step": 15648 + }, + { + "epoch": 1.9907136496628928, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.827916145324707, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8724654316902161, + "num_tokens": 596953554.0, + "step": 15649 + }, + { + "epoch": 1.9908408599414833, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.011577844619751, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8594232201576233, + "num_tokens": 596991206.0, + "step": 15650 + }, + { + "epoch": 1.9909680702200738, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.879395604133606, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8677904605865479, + "num_tokens": 597027222.0, + "step": 15651 + }, + { + "epoch": 1.9910952804986644, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8925068378448486, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.873910665512085, + "num_tokens": 597069040.0, + "step": 15652 + }, + { + "epoch": 1.9912224907772549, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8734592199325562, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8573915958404541, + "num_tokens": 597112529.0, + "step": 15653 + }, + { + "epoch": 1.9913497010558454, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.899711012840271, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8653508424758911, + "num_tokens": 597148304.0, + "step": 15654 + }, + { + "epoch": 1.991476911334436, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.923970103263855, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8736721277236938, + "num_tokens": 597184244.0, + "step": 15655 + }, + { + "epoch": 1.9916041216130265, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.225830554962158, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8585774898529053, + "num_tokens": 597218178.0, + "step": 15656 + }, + { + "epoch": 1.991731331891617, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0336368083953857, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8625873923301697, + "num_tokens": 597259256.0, + "step": 15657 + }, + { + "epoch": 1.9918585421702073, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8399581909179688, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8755200505256653, + "num_tokens": 597296584.0, + "step": 15658 + }, + { + "epoch": 1.9919857524487978, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.820214867591858, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8489514589309692, + "num_tokens": 597341116.0, + "step": 15659 + }, + { + "epoch": 1.9921129627273884, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0935070514678955, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.86292964220047, + "num_tokens": 597375595.0, + "step": 15660 + }, + { + "epoch": 1.9922401730059789, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9286448955535889, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8499917984008789, + "num_tokens": 597419226.0, + "step": 15661 + }, + { + "epoch": 1.9923673832845694, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.843809962272644, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8458298444747925, + "num_tokens": 597461242.0, + "step": 15662 + }, + { + "epoch": 1.99249459356316, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.892510175704956, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8650690317153931, + "num_tokens": 597500030.0, + "step": 15663 + }, + { + "epoch": 1.9926218038417502, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9570807218551636, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8488439321517944, + "num_tokens": 597540428.0, + "step": 15664 + }, + { + "epoch": 1.9927490141203408, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.968410849571228, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8748247623443604, + "num_tokens": 597575070.0, + "step": 15665 + }, + { + "epoch": 1.9928762243989313, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9666426181793213, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8627324104309082, + "num_tokens": 597617425.0, + "step": 15666 + }, + { + "epoch": 1.9930034346775218, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8919835090637207, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8713730573654175, + "num_tokens": 597660489.0, + "step": 15667 + }, + { + "epoch": 1.9931306449561124, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6531851291656494, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8702722787857056, + "num_tokens": 597703626.0, + "step": 15668 + }, + { + "epoch": 1.993257855234703, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8701817989349365, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8609545230865479, + "num_tokens": 597741176.0, + "step": 15669 + }, + { + "epoch": 1.9933850655132934, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9418902397155762, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8519726991653442, + "num_tokens": 597779763.0, + "step": 15670 + }, + { + "epoch": 1.993512275791884, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8685333728790283, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8683831691741943, + "num_tokens": 597813300.0, + "step": 15671 + }, + { + "epoch": 1.9936394860704745, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9145258665084839, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8545059561729431, + "num_tokens": 597849062.0, + "step": 15672 + }, + { + "epoch": 1.993766696349065, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8704739809036255, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8656084537506104, + "num_tokens": 597886565.0, + "step": 15673 + }, + { + "epoch": 1.9938939066276555, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.7147204875946045, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8660323023796082, + "num_tokens": 597925097.0, + "step": 15674 + }, + { + "epoch": 1.994021116906246, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.060612678527832, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8769122362136841, + "num_tokens": 597960786.0, + "step": 15675 + }, + { + "epoch": 1.9941483271848366, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.24041485786438, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8463178277015686, + "num_tokens": 598000442.0, + "step": 15676 + }, + { + "epoch": 1.9942755374634271, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7177070379257202, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8619816899299622, + "num_tokens": 598042946.0, + "step": 15677 + }, + { + "epoch": 1.9944027477420176, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1586062908172607, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8431128859519958, + "num_tokens": 598081659.0, + "step": 15678 + }, + { + "epoch": 1.9945299580206082, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0650486946105957, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8588235378265381, + "num_tokens": 598120203.0, + "step": 15679 + }, + { + "epoch": 1.9946571682991987, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9301272630691528, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8507161140441895, + "num_tokens": 598154779.0, + "step": 15680 + }, + { + "epoch": 1.9947843785777892, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.800299048423767, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8646646738052368, + "num_tokens": 598197075.0, + "step": 15681 + }, + { + "epoch": 1.9949115888563798, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.894968867301941, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8557624816894531, + "num_tokens": 598240539.0, + "step": 15682 + }, + { + "epoch": 1.99503879913497, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.778990626335144, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8654544949531555, + "num_tokens": 598281776.0, + "step": 15683 + }, + { + "epoch": 1.9951660094135606, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7064520120620728, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8706235885620117, + "num_tokens": 598322715.0, + "step": 15684 + }, + { + "epoch": 1.9952932196921511, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.755375385284424, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.857714056968689, + "num_tokens": 598364495.0, + "step": 15685 + }, + { + "epoch": 1.9954204299707416, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8442301750183105, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.860649824142456, + "num_tokens": 598407102.0, + "step": 15686 + }, + { + "epoch": 1.9955476402493322, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9451757669448853, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8495273590087891, + "num_tokens": 598448600.0, + "step": 15687 + }, + { + "epoch": 1.9956748505279227, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7645827531814575, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8533713817596436, + "num_tokens": 598491140.0, + "step": 15688 + }, + { + "epoch": 1.995802060806513, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9653652906417847, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8590651750564575, + "num_tokens": 598527831.0, + "step": 15689 + }, + { + "epoch": 1.9959292710851035, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.906121850013733, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8504383563995361, + "num_tokens": 598568680.0, + "step": 15690 + }, + { + "epoch": 1.996056481363694, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9141018390655518, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8527442812919617, + "num_tokens": 598606254.0, + "step": 15691 + }, + { + "epoch": 1.9961836916422846, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.8615773916244507, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8656660318374634, + "num_tokens": 598642400.0, + "step": 15692 + }, + { + "epoch": 1.9963109019208751, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8520586490631104, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8421840667724609, + "num_tokens": 598678379.0, + "step": 15693 + }, + { + "epoch": 1.9964381121994657, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9025473594665527, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8624751567840576, + "num_tokens": 598718234.0, + "step": 15694 + }, + { + "epoch": 1.9965653224780562, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.0443665981292725, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8673409223556519, + "num_tokens": 598754897.0, + "step": 15695 + }, + { + "epoch": 1.9966925327566467, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0020334720611572, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8579341173171997, + "num_tokens": 598785268.0, + "step": 15696 + }, + { + "epoch": 1.9968197430352372, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7685058116912842, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8696431517601013, + "num_tokens": 598827131.0, + "step": 15697 + }, + { + "epoch": 1.9969469533138278, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9799379110336304, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8643132448196411, + "num_tokens": 598859507.0, + "step": 15698 + }, + { + "epoch": 1.9970741635924183, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9236724376678467, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8548623919487, + "num_tokens": 598893280.0, + "step": 15699 + }, + { + "epoch": 1.9972013738710088, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.843284010887146, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8774230480194092, + "num_tokens": 598932759.0, + "step": 15700 + }, + { + "epoch": 1.9973285841495994, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.767200231552124, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8733725547790527, + "num_tokens": 598972899.0, + "step": 15701 + }, + { + "epoch": 1.9974557944281899, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8213121891021729, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8649119734764099, + "num_tokens": 599012811.0, + "step": 15702 + }, + { + "epoch": 1.9975830047067804, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8256456851959229, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8542691469192505, + "num_tokens": 599055632.0, + "step": 15703 + }, + { + "epoch": 1.997710214985371, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9842040538787842, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8471146821975708, + "num_tokens": 599090924.0, + "step": 15704 + }, + { + "epoch": 1.9978374252639615, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8100224733352661, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8773312568664551, + "num_tokens": 599130434.0, + "step": 15705 + }, + { + "epoch": 1.997964635542552, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.981438398361206, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8629398345947266, + "num_tokens": 599171018.0, + "step": 15706 + }, + { + "epoch": 1.9980918458211423, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8120125532150269, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.85564786195755, + "num_tokens": 599212445.0, + "step": 15707 + }, + { + "epoch": 1.9982190560997328, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.836868405342102, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8671749830245972, + "num_tokens": 599250591.0, + "step": 15708 + }, + { + "epoch": 1.9983462663783234, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8900991678237915, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8592734336853027, + "num_tokens": 599284987.0, + "step": 15709 + }, + { + "epoch": 1.9984734766569139, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9437321424484253, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.859222412109375, + "num_tokens": 599324656.0, + "step": 15710 + }, + { + "epoch": 1.9986006869355044, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8990365266799927, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8743753433227539, + "num_tokens": 599364014.0, + "step": 15711 + }, + { + "epoch": 1.998727897214095, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.585111141204834, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.856819748878479, + "num_tokens": 599396781.0, + "step": 15712 + }, + { + "epoch": 1.9988551074926852, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.027438163757324, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8735815286636353, + "num_tokens": 599433240.0, + "step": 15713 + }, + { + "epoch": 1.9989823177712758, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9004594087600708, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8707442283630371, + "num_tokens": 599474883.0, + "step": 15714 + }, + { + "epoch": 1.9991095280498663, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.977597951889038, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8640410900115967, + "num_tokens": 599508492.0, + "step": 15715 + }, + { + "epoch": 1.9992367383284568, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.794732928276062, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8654816150665283, + "num_tokens": 599545946.0, + "step": 15716 + }, + { + "epoch": 1.9993639486070474, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7604038715362549, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8769204616546631, + "num_tokens": 599586217.0, + "step": 15717 + }, + { + "epoch": 1.9994911588856379, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8849560022354126, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.872846245765686, + "num_tokens": 599620431.0, + "step": 15718 + }, + { + "epoch": 1.9996183691642284, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9752912521362305, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8485195636749268, + "num_tokens": 599659959.0, + "step": 15719 + }, + { + "epoch": 1.999745579442819, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.010185480117798, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8607797026634216, + "num_tokens": 599696558.0, + "step": 15720 + }, + { + "epoch": 1.9998727897214095, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.801822304725647, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8524726629257202, + "num_tokens": 599734925.0, + "step": 15721 + }, + { + "epoch": 2.0, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.828540325164795, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.866943359375, + "num_tokens": 599772613.0, + "step": 15722 + }, + { + "epoch": 2.0001272102785905, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7312431335449219, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8673052787780762, + "num_tokens": 599813892.0, + "step": 15723 + }, + { + "epoch": 2.000254420557181, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.0878522396087646, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8662583827972412, + "num_tokens": 599852807.0, + "step": 15724 + }, + { + "epoch": 2.0003816308357716, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9467806816101074, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8717482089996338, + "num_tokens": 599893712.0, + "step": 15725 + }, + { + "epoch": 2.000508841114362, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.223162889480591, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8666222095489502, + "num_tokens": 599929954.0, + "step": 15726 + }, + { + "epoch": 2.0006360513929526, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8125942945480347, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8643074035644531, + "num_tokens": 599967763.0, + "step": 15727 + }, + { + "epoch": 2.000763261671543, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 16.96582794189453, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.874416708946228, + "num_tokens": 600008938.0, + "step": 15728 + }, + { + "epoch": 2.0008904719501337, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.040966033935547, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8618039488792419, + "num_tokens": 600051849.0, + "step": 15729 + }, + { + "epoch": 2.0010176822287242, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.2076005935668945, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8722195625305176, + "num_tokens": 600088568.0, + "step": 15730 + }, + { + "epoch": 2.0011448925073148, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9819204807281494, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8700941801071167, + "num_tokens": 600131553.0, + "step": 15731 + }, + { + "epoch": 2.0012721027859053, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8436404466629028, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.865656316280365, + "num_tokens": 600173198.0, + "step": 15732 + }, + { + "epoch": 2.001399313064496, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7929021120071411, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8700138330459595, + "num_tokens": 600211347.0, + "step": 15733 + }, + { + "epoch": 2.0015265233430863, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.850496768951416, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8592915534973145, + "num_tokens": 600249953.0, + "step": 15734 + }, + { + "epoch": 2.0016537336216764, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9373756647109985, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8573167324066162, + "num_tokens": 600291211.0, + "step": 15735 + }, + { + "epoch": 2.001780943900267, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8072627782821655, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8704960346221924, + "num_tokens": 600332906.0, + "step": 15736 + }, + { + "epoch": 2.0019081541788575, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.911584496498108, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8669056296348572, + "num_tokens": 600370570.0, + "step": 15737 + }, + { + "epoch": 2.002035364457448, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 16.6174373626709, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8562260270118713, + "num_tokens": 600409368.0, + "step": 15738 + }, + { + "epoch": 2.0021625747360385, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.098351240158081, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8698078989982605, + "num_tokens": 600442019.0, + "step": 15739 + }, + { + "epoch": 2.002289785014629, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2590115070343018, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8736803531646729, + "num_tokens": 600471665.0, + "step": 15740 + }, + { + "epoch": 2.0024169952932196, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1583330631256104, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8635163903236389, + "num_tokens": 600508804.0, + "step": 15741 + }, + { + "epoch": 2.00254420557181, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.994202733039856, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8530248999595642, + "num_tokens": 600543895.0, + "step": 15742 + }, + { + "epoch": 2.0026714158504006, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.85377037525177, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8677414655685425, + "num_tokens": 600585818.0, + "step": 15743 + }, + { + "epoch": 2.002798626128991, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.027960777282715, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8691809773445129, + "num_tokens": 600623106.0, + "step": 15744 + }, + { + "epoch": 2.0029258364075817, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8836675882339478, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.887373685836792, + "num_tokens": 600662662.0, + "step": 15745 + }, + { + "epoch": 2.0030530466861722, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.050840377807617, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8526824712753296, + "num_tokens": 600699416.0, + "step": 15746 + }, + { + "epoch": 2.0031802569647628, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.006246328353882, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8592733144760132, + "num_tokens": 600739351.0, + "step": 15747 + }, + { + "epoch": 2.0033074672433533, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.829573392868042, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8686676025390625, + "num_tokens": 600781198.0, + "step": 15748 + }, + { + "epoch": 2.003434677521944, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9968334436416626, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8633330464363098, + "num_tokens": 600821743.0, + "step": 15749 + }, + { + "epoch": 2.0035618878005343, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9148504734039307, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8798267245292664, + "num_tokens": 600862463.0, + "step": 15750 + }, + { + "epoch": 2.003689098079125, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9935282468795776, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8561795949935913, + "num_tokens": 600902629.0, + "step": 15751 + }, + { + "epoch": 2.0038163083577154, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8470789194107056, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8640594482421875, + "num_tokens": 600941778.0, + "step": 15752 + }, + { + "epoch": 2.003943518636306, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9020195007324219, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8742257356643677, + "num_tokens": 600979425.0, + "step": 15753 + }, + { + "epoch": 2.0040707289148965, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.942009449005127, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.861259937286377, + "num_tokens": 601017882.0, + "step": 15754 + }, + { + "epoch": 2.004197939193487, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0956873893737793, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8615347146987915, + "num_tokens": 601053394.0, + "step": 15755 + }, + { + "epoch": 2.0043251494720775, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.880528450012207, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.86887127161026, + "num_tokens": 601094940.0, + "step": 15756 + }, + { + "epoch": 2.004452359750668, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9959050416946411, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8675005435943604, + "num_tokens": 601132963.0, + "step": 15757 + }, + { + "epoch": 2.0045795700292586, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0223805904388428, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8642451763153076, + "num_tokens": 601168511.0, + "step": 15758 + }, + { + "epoch": 2.0047067803078487, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8098379373550415, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8676371574401855, + "num_tokens": 601208922.0, + "step": 15759 + }, + { + "epoch": 2.004833990586439, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.868515133857727, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8606172204017639, + "num_tokens": 601253814.0, + "step": 15760 + }, + { + "epoch": 2.0049612008650297, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1339237689971924, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8677721619606018, + "num_tokens": 601295584.0, + "step": 15761 + }, + { + "epoch": 2.0050884111436202, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9921238422393799, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8587024211883545, + "num_tokens": 601329134.0, + "step": 15762 + }, + { + "epoch": 2.0052156214222108, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7968077659606934, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8741691708564758, + "num_tokens": 601369100.0, + "step": 15763 + }, + { + "epoch": 2.0053428317008013, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.9576430320739746, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.880318820476532, + "num_tokens": 601407876.0, + "step": 15764 + }, + { + "epoch": 2.005470041979392, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9825820922851562, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8788129687309265, + "num_tokens": 601446996.0, + "step": 15765 + }, + { + "epoch": 2.0055972522579824, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.095778226852417, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8572729825973511, + "num_tokens": 601482158.0, + "step": 15766 + }, + { + "epoch": 2.005724462536573, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9101160764694214, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8726974725723267, + "num_tokens": 601517940.0, + "step": 15767 + }, + { + "epoch": 2.0058516728151634, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8557276725769043, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8862819671630859, + "num_tokens": 601553823.0, + "step": 15768 + }, + { + "epoch": 2.005978883093754, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8662797212600708, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8540986776351929, + "num_tokens": 601595482.0, + "step": 15769 + }, + { + "epoch": 2.0061060933723445, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9641057252883911, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8741878271102905, + "num_tokens": 601632451.0, + "step": 15770 + }, + { + "epoch": 2.006233303650935, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9563990831375122, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8638916015625, + "num_tokens": 601667573.0, + "step": 15771 + }, + { + "epoch": 2.0063605139295255, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 16.614307403564453, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8734692335128784, + "num_tokens": 601704494.0, + "step": 15772 + }, + { + "epoch": 2.006487724208116, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0061392784118652, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8545851707458496, + "num_tokens": 601745307.0, + "step": 15773 + }, + { + "epoch": 2.0066149344867066, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0181329250335693, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8810870051383972, + "num_tokens": 601782551.0, + "step": 15774 + }, + { + "epoch": 2.006742144765297, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.160715341567993, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8684409856796265, + "num_tokens": 601823378.0, + "step": 15775 + }, + { + "epoch": 2.0068693550438876, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8437480926513672, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.867576003074646, + "num_tokens": 601860881.0, + "step": 15776 + }, + { + "epoch": 2.006996565322478, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8977019786834717, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8640875816345215, + "num_tokens": 601903502.0, + "step": 15777 + }, + { + "epoch": 2.0071237756010687, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.842203140258789, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8852728605270386, + "num_tokens": 601937097.0, + "step": 15778 + }, + { + "epoch": 2.007250985879659, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9372135400772095, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8703386187553406, + "num_tokens": 601978050.0, + "step": 15779 + }, + { + "epoch": 2.0073781961582498, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.836059808731079, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8648819923400879, + "num_tokens": 602018167.0, + "step": 15780 + }, + { + "epoch": 2.0075054064368403, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9294277429580688, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8528438806533813, + "num_tokens": 602054485.0, + "step": 15781 + }, + { + "epoch": 2.007632616715431, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.158118724822998, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8717251420021057, + "num_tokens": 602092881.0, + "step": 15782 + }, + { + "epoch": 2.0077598269940213, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.876326084136963, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8679100275039673, + "num_tokens": 602127424.0, + "step": 15783 + }, + { + "epoch": 2.0078870372726114, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9321104288101196, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8722757697105408, + "num_tokens": 602168340.0, + "step": 15784 + }, + { + "epoch": 2.008014247551202, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0213706493377686, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8566046357154846, + "num_tokens": 602207894.0, + "step": 15785 + }, + { + "epoch": 2.0081414578297925, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9349383115768433, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8690441250801086, + "num_tokens": 602245924.0, + "step": 15786 + }, + { + "epoch": 2.008268668108383, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.1427321434020996, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8656912446022034, + "num_tokens": 602278417.0, + "step": 15787 + }, + { + "epoch": 2.0083958783869735, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.7918330430984497, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8697524070739746, + "num_tokens": 602318431.0, + "step": 15788 + }, + { + "epoch": 2.008523088665564, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.174950122833252, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8563826680183411, + "num_tokens": 602356315.0, + "step": 15789 + }, + { + "epoch": 2.0086502989441546, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8130656480789185, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8815625309944153, + "num_tokens": 602398865.0, + "step": 15790 + }, + { + "epoch": 2.008777509222745, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.833065152168274, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.876562237739563, + "num_tokens": 602435835.0, + "step": 15791 + }, + { + "epoch": 2.0089047195013356, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8542991876602173, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8732171058654785, + "num_tokens": 602470359.0, + "step": 15792 + }, + { + "epoch": 2.009031929779926, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.8909281492233276, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8697377443313599, + "num_tokens": 602512132.0, + "step": 15793 + }, + { + "epoch": 2.0091591400585167, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.875122308731079, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8637691736221313, + "num_tokens": 602552700.0, + "step": 15794 + }, + { + "epoch": 2.0092863503371072, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9297449588775635, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8656757473945618, + "num_tokens": 602587779.0, + "step": 15795 + }, + { + "epoch": 2.0094135606156978, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9870336055755615, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8573119640350342, + "num_tokens": 602623834.0, + "step": 15796 + }, + { + "epoch": 2.0095407708942883, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.021080493927002, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8789486289024353, + "num_tokens": 602660141.0, + "step": 15797 + }, + { + "epoch": 2.009667981172879, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9094949960708618, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8711471557617188, + "num_tokens": 602698990.0, + "step": 15798 + }, + { + "epoch": 2.0097951914514693, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0544729232788086, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8477962613105774, + "num_tokens": 602732053.0, + "step": 15799 + }, + { + "epoch": 2.00992240173006, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 80.52400970458984, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.866963267326355, + "num_tokens": 602770975.0, + "step": 15800 + }, + { + "epoch": 2.0100496120086504, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 16.620290756225586, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.859001636505127, + "num_tokens": 602813207.0, + "step": 15801 + }, + { + "epoch": 2.010176822287241, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1471176147460938, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8712053298950195, + "num_tokens": 602849951.0, + "step": 15802 + }, + { + "epoch": 2.0103040325658315, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1660995483398438, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8590850830078125, + "num_tokens": 602891418.0, + "step": 15803 + }, + { + "epoch": 2.010431242844422, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.1783230304718018, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.873461127281189, + "num_tokens": 602933039.0, + "step": 15804 + }, + { + "epoch": 2.0105584531230125, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.060126781463623, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8833656311035156, + "num_tokens": 602965546.0, + "step": 15805 + }, + { + "epoch": 2.010685663401603, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.879457712173462, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8662206530570984, + "num_tokens": 603004473.0, + "step": 15806 + }, + { + "epoch": 2.0108128736801936, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.1757404804229736, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.859989583492279, + "num_tokens": 603041644.0, + "step": 15807 + }, + { + "epoch": 2.0109400839587837, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9040025472640991, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8549810647964478, + "num_tokens": 603084698.0, + "step": 15808 + }, + { + "epoch": 2.011067294237374, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 1.9622838497161865, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8499269485473633, + "num_tokens": 603121059.0, + "step": 15809 + }, + { + "epoch": 2.0111945045159647, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0523736476898193, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8652162551879883, + "num_tokens": 603156604.0, + "step": 15810 + }, + { + "epoch": 2.0113217147945552, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.220942258834839, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8681411147117615, + "num_tokens": 603183743.0, + "step": 15811 + }, + { + "epoch": 2.0114489250731458, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.0178818702697754, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8453102111816406, + "num_tokens": 603222099.0, + "step": 15812 + }, + { + "epoch": 2.0115761353517363, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9369710683822632, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8471779227256775, + "num_tokens": 603263467.0, + "step": 15813 + }, + { + "epoch": 2.011703345630327, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.364534854888916, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8468561768531799, + "num_tokens": 603296726.0, + "step": 15814 + }, + { + "epoch": 2.0118305559089174, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.101804733276367, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8732672333717346, + "num_tokens": 603329691.0, + "step": 15815 + }, + { + "epoch": 2.011957766187508, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.846510887145996, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.874545156955719, + "num_tokens": 603371619.0, + "step": 15816 + }, + { + "epoch": 2.0120849764660984, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.380746841430664, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8687900304794312, + "num_tokens": 603400018.0, + "step": 15817 + }, + { + "epoch": 2.012212186744689, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8584785461425781, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.869709849357605, + "num_tokens": 603436971.0, + "step": 15818 + }, + { + "epoch": 2.0123393970232795, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.955781102180481, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8661360740661621, + "num_tokens": 603472802.0, + "step": 15819 + }, + { + "epoch": 2.01246660730187, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9964933395385742, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.871012270450592, + "num_tokens": 603506547.0, + "step": 15820 + }, + { + "epoch": 2.0125938175804605, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9866178035736084, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8499408960342407, + "num_tokens": 603544967.0, + "step": 15821 + }, + { + "epoch": 2.012721027859051, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.062814950942993, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8620635271072388, + "num_tokens": 603582522.0, + "step": 15822 + }, + { + "epoch": 2.0128482381376416, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0130741596221924, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8762567639350891, + "num_tokens": 603619584.0, + "step": 15823 + }, + { + "epoch": 2.012975448416232, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.974739909172058, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8535230159759521, + "num_tokens": 603660542.0, + "step": 15824 + }, + { + "epoch": 2.0131026586948226, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.93980872631073, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8687595129013062, + "num_tokens": 603698900.0, + "step": 15825 + }, + { + "epoch": 2.013229868973413, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0892910957336426, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8470033407211304, + "num_tokens": 603733019.0, + "step": 15826 + }, + { + "epoch": 2.0133570792520037, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0327892303466797, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.873476505279541, + "num_tokens": 603772071.0, + "step": 15827 + }, + { + "epoch": 2.013484289530594, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9567041397094727, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8596553206443787, + "num_tokens": 603811008.0, + "step": 15828 + }, + { + "epoch": 2.0136114998091847, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0217530727386475, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8544186353683472, + "num_tokens": 603852647.0, + "step": 15829 + }, + { + "epoch": 2.0137387100877753, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.7696266174316406, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8667330145835876, + "num_tokens": 603894198.0, + "step": 15830 + }, + { + "epoch": 2.013865920366366, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9575830698013306, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8611005544662476, + "num_tokens": 603930168.0, + "step": 15831 + }, + { + "epoch": 2.0139931306449563, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8654510974884033, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8764214515686035, + "num_tokens": 603966189.0, + "step": 15832 + }, + { + "epoch": 2.0141203409235464, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.7772680521011353, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8634296655654907, + "num_tokens": 604011110.0, + "step": 15833 + }, + { + "epoch": 2.014247551202137, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.1025302410125732, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8492172360420227, + "num_tokens": 604044256.0, + "step": 15834 + }, + { + "epoch": 2.0143747614807275, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.1564841270446777, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8748700618743896, + "num_tokens": 604078652.0, + "step": 15835 + }, + { + "epoch": 2.014501971759318, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9530359506607056, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8751083612442017, + "num_tokens": 604112302.0, + "step": 15836 + }, + { + "epoch": 2.0146291820379085, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.7610137462615967, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8831169605255127, + "num_tokens": 604150011.0, + "step": 15837 + }, + { + "epoch": 2.014756392316499, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.1707775592803955, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8677855134010315, + "num_tokens": 604188859.0, + "step": 15838 + }, + { + "epoch": 2.0148836025950896, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8062843084335327, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8619148135185242, + "num_tokens": 604232273.0, + "step": 15839 + }, + { + "epoch": 2.01501081287368, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0491974353790283, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8731260299682617, + "num_tokens": 604265236.0, + "step": 15840 + }, + { + "epoch": 2.0151380231522706, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.066298007965088, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.880622923374176, + "num_tokens": 604304784.0, + "step": 15841 + }, + { + "epoch": 2.015265233430861, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8218239545822144, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8926893472671509, + "num_tokens": 604341461.0, + "step": 15842 + }, + { + "epoch": 2.0153924437094517, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9519683122634888, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8735941052436829, + "num_tokens": 604378628.0, + "step": 15843 + }, + { + "epoch": 2.0155196539880422, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9726717472076416, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8769940137863159, + "num_tokens": 604418070.0, + "step": 15844 + }, + { + "epoch": 2.0156468642666328, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.830476999282837, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8782526254653931, + "num_tokens": 604454445.0, + "step": 15845 + }, + { + "epoch": 2.0157740745452233, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.0224390029907227, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8708233833312988, + "num_tokens": 604488559.0, + "step": 15846 + }, + { + "epoch": 2.015901284823814, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.094618797302246, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8583264946937561, + "num_tokens": 604522538.0, + "step": 15847 + }, + { + "epoch": 2.0160284951024043, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.715743064880371, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.884199857711792, + "num_tokens": 604561750.0, + "step": 15848 + }, + { + "epoch": 2.016155705380995, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9019118547439575, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8800078630447388, + "num_tokens": 604594961.0, + "step": 15849 + }, + { + "epoch": 2.0162829156595854, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.026979684829712, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8565255999565125, + "num_tokens": 604631301.0, + "step": 15850 + }, + { + "epoch": 2.016410125938176, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.1349992752075195, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8474729061126709, + "num_tokens": 604666049.0, + "step": 15851 + }, + { + "epoch": 2.0165373362167665, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.1356842517852783, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.88775634765625, + "num_tokens": 604701984.0, + "step": 15852 + }, + { + "epoch": 2.016664546495357, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9261572360992432, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8782458901405334, + "num_tokens": 604738414.0, + "step": 15853 + }, + { + "epoch": 2.0167917567739475, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.7700146436691284, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8706042766571045, + "num_tokens": 604779961.0, + "step": 15854 + }, + { + "epoch": 2.016918967052538, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.859459400177002, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.860880970954895, + "num_tokens": 604819729.0, + "step": 15855 + }, + { + "epoch": 2.0170461773311286, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.7677491903305054, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8777825832366943, + "num_tokens": 604864108.0, + "step": 15856 + }, + { + "epoch": 2.0171733876097186, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8863589763641357, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8609234094619751, + "num_tokens": 604905582.0, + "step": 15857 + }, + { + "epoch": 2.017300597888309, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.185419797897339, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8764017820358276, + "num_tokens": 604943931.0, + "step": 15858 + }, + { + "epoch": 2.0174278081668997, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.949084758758545, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8635900020599365, + "num_tokens": 604982100.0, + "step": 15859 + }, + { + "epoch": 2.0175550184454902, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8592805862426758, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8567306995391846, + "num_tokens": 605026289.0, + "step": 15860 + }, + { + "epoch": 2.0176822287240808, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8133809566497803, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8721255660057068, + "num_tokens": 605070214.0, + "step": 15861 + }, + { + "epoch": 2.0178094390026713, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.7522112131118774, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.862469494342804, + "num_tokens": 605113495.0, + "step": 15862 + }, + { + "epoch": 2.017936649281262, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.977670431137085, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8708066940307617, + "num_tokens": 605145717.0, + "step": 15863 + }, + { + "epoch": 2.0180638595598523, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.047950267791748, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8589788675308228, + "num_tokens": 605180930.0, + "step": 15864 + }, + { + "epoch": 2.018191069838443, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9912577867507935, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8742356300354004, + "num_tokens": 605214004.0, + "step": 15865 + }, + { + "epoch": 2.0183182801170334, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9814578294754028, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8684468269348145, + "num_tokens": 605253244.0, + "step": 15866 + }, + { + "epoch": 2.018445490395624, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.075160264968872, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8622972965240479, + "num_tokens": 605293832.0, + "step": 15867 + }, + { + "epoch": 2.0185727006742145, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8863736391067505, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8768699169158936, + "num_tokens": 605333811.0, + "step": 15868 + }, + { + "epoch": 2.018699910952805, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0170586109161377, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8671858310699463, + "num_tokens": 605371560.0, + "step": 15869 + }, + { + "epoch": 2.0188271212313955, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9388091564178467, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8655431270599365, + "num_tokens": 605410555.0, + "step": 15870 + }, + { + "epoch": 2.018954331509986, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8802533149719238, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8614694476127625, + "num_tokens": 605451851.0, + "step": 15871 + }, + { + "epoch": 2.0190815417885766, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.1017653942108154, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.857538640499115, + "num_tokens": 605490442.0, + "step": 15872 + }, + { + "epoch": 2.019208752067167, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.976802110671997, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8651893138885498, + "num_tokens": 605525237.0, + "step": 15873 + }, + { + "epoch": 2.0193359623457576, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8537380695343018, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8716800808906555, + "num_tokens": 605563624.0, + "step": 15874 + }, + { + "epoch": 2.019463172624348, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9533318281173706, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8771792054176331, + "num_tokens": 605597419.0, + "step": 15875 + }, + { + "epoch": 2.0195903829029387, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.013503074645996, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.858399510383606, + "num_tokens": 605639954.0, + "step": 15876 + }, + { + "epoch": 2.019717593181529, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8800030946731567, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8678403496742249, + "num_tokens": 605678226.0, + "step": 15877 + }, + { + "epoch": 2.0198448034601197, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8481584787368774, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8764497637748718, + "num_tokens": 605719925.0, + "step": 15878 + }, + { + "epoch": 2.0199720137387103, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0159473419189453, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8488343954086304, + "num_tokens": 605761282.0, + "step": 15879 + }, + { + "epoch": 2.020099224017301, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.2666845321655273, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8773161172866821, + "num_tokens": 605800681.0, + "step": 15880 + }, + { + "epoch": 2.0202264342958913, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.160447835922241, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8496392965316772, + "num_tokens": 605836282.0, + "step": 15881 + }, + { + "epoch": 2.0203536445744814, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8408353328704834, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8690613508224487, + "num_tokens": 605880360.0, + "step": 15882 + }, + { + "epoch": 2.020480854853072, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9251593351364136, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8763710260391235, + "num_tokens": 605918363.0, + "step": 15883 + }, + { + "epoch": 2.0206080651316625, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9806982278823853, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8804928660392761, + "num_tokens": 605953032.0, + "step": 15884 + }, + { + "epoch": 2.020735275410253, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.005404472351074, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8743904829025269, + "num_tokens": 605992633.0, + "step": 15885 + }, + { + "epoch": 2.0208624856888435, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9056848287582397, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8676950335502625, + "num_tokens": 606031776.0, + "step": 15886 + }, + { + "epoch": 2.020989695967434, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9084421396255493, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8604447841644287, + "num_tokens": 606070614.0, + "step": 15887 + }, + { + "epoch": 2.0211169062460246, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.6775130033493042, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8670076131820679, + "num_tokens": 606115898.0, + "step": 15888 + }, + { + "epoch": 2.021244116524615, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.805406332015991, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8700951933860779, + "num_tokens": 606150231.0, + "step": 15889 + }, + { + "epoch": 2.0213713268032056, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9200416803359985, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8597466349601746, + "num_tokens": 606191527.0, + "step": 15890 + }, + { + "epoch": 2.021498537081796, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.034059762954712, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8605329990386963, + "num_tokens": 606229070.0, + "step": 15891 + }, + { + "epoch": 2.0216257473603867, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9101803302764893, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8674307465553284, + "num_tokens": 606269555.0, + "step": 15892 + }, + { + "epoch": 2.021752957638977, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0000555515289307, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.861531138420105, + "num_tokens": 606302040.0, + "step": 15893 + }, + { + "epoch": 2.0218801679175677, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.7466659545898438, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8754243850708008, + "num_tokens": 606341769.0, + "step": 15894 + }, + { + "epoch": 2.0220073781961583, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9251619577407837, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8536766767501831, + "num_tokens": 606379989.0, + "step": 15895 + }, + { + "epoch": 2.022134588474749, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9780694246292114, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8612544536590576, + "num_tokens": 606414396.0, + "step": 15896 + }, + { + "epoch": 2.0222617987533393, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0343987941741943, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8704850673675537, + "num_tokens": 606445949.0, + "step": 15897 + }, + { + "epoch": 2.02238900903193, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8430551290512085, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8697246313095093, + "num_tokens": 606484751.0, + "step": 15898 + }, + { + "epoch": 2.0225162193105204, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8953919410705566, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8708629608154297, + "num_tokens": 606525390.0, + "step": 15899 + }, + { + "epoch": 2.022643429589111, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.967803955078125, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8640443086624146, + "num_tokens": 606563364.0, + "step": 15900 + }, + { + "epoch": 2.0227706398677014, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.2272427082061768, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8696094751358032, + "num_tokens": 606598140.0, + "step": 15901 + }, + { + "epoch": 2.022897850146292, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8706711530685425, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8604245185852051, + "num_tokens": 606636480.0, + "step": 15902 + }, + { + "epoch": 2.0230250604248825, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8536863327026367, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8693026304244995, + "num_tokens": 606676044.0, + "step": 15903 + }, + { + "epoch": 2.023152270703473, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8336809873580933, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8599979281425476, + "num_tokens": 606713262.0, + "step": 15904 + }, + { + "epoch": 2.0232794809820636, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.928939938545227, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8629300594329834, + "num_tokens": 606754844.0, + "step": 15905 + }, + { + "epoch": 2.0234066912606536, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.6834619045257568, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8678078651428223, + "num_tokens": 606800910.0, + "step": 15906 + }, + { + "epoch": 2.023533901539244, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.690503716468811, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.861902117729187, + "num_tokens": 606841855.0, + "step": 15907 + }, + { + "epoch": 2.0236611118178347, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.7287461757659912, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8556779623031616, + "num_tokens": 606888261.0, + "step": 15908 + }, + { + "epoch": 2.0237883220964252, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8846371173858643, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.859377384185791, + "num_tokens": 606926235.0, + "step": 15909 + }, + { + "epoch": 2.0239155323750158, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9529811143875122, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8626593351364136, + "num_tokens": 606962527.0, + "step": 15910 + }, + { + "epoch": 2.0240427426536063, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9149272441864014, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8731558322906494, + "num_tokens": 607008924.0, + "step": 15911 + }, + { + "epoch": 2.024169952932197, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8173027038574219, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8635064363479614, + "num_tokens": 607049788.0, + "step": 15912 + }, + { + "epoch": 2.0242971632107873, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9896924495697021, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8602863550186157, + "num_tokens": 607087592.0, + "step": 15913 + }, + { + "epoch": 2.024424373489378, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.8231585025787354, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8832098245620728, + "num_tokens": 607119954.0, + "step": 15914 + }, + { + "epoch": 2.0245515837679684, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9670745134353638, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8588255047798157, + "num_tokens": 607156511.0, + "step": 15915 + }, + { + "epoch": 2.024678794046559, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9229507446289062, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8748050332069397, + "num_tokens": 607198292.0, + "step": 15916 + }, + { + "epoch": 2.0248060043251495, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9417365789413452, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8632916212081909, + "num_tokens": 607237572.0, + "step": 15917 + }, + { + "epoch": 2.02493321460374, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.824203372001648, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.858140766620636, + "num_tokens": 607282321.0, + "step": 15918 + }, + { + "epoch": 2.0250604248823305, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.021364450454712, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.86463862657547, + "num_tokens": 607313355.0, + "step": 15919 + }, + { + "epoch": 2.025187635160921, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.891588568687439, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8638251423835754, + "num_tokens": 607351961.0, + "step": 15920 + }, + { + "epoch": 2.0253148454395116, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.7817531824111938, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8696236610412598, + "num_tokens": 607391368.0, + "step": 15921 + }, + { + "epoch": 2.025442055718102, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8873740434646606, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8848433494567871, + "num_tokens": 607427328.0, + "step": 15922 + }, + { + "epoch": 2.0255692659966926, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.0060105323791504, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8606806993484497, + "num_tokens": 607468243.0, + "step": 15923 + }, + { + "epoch": 2.025696476275283, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9291893243789673, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8500306010246277, + "num_tokens": 607502466.0, + "step": 15924 + }, + { + "epoch": 2.0258236865538737, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9435019493103027, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8573166131973267, + "num_tokens": 607544093.0, + "step": 15925 + }, + { + "epoch": 2.025950896832464, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.1529335975646973, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.872699499130249, + "num_tokens": 607576054.0, + "step": 15926 + }, + { + "epoch": 2.0260781071110547, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.007793664932251, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8797138929367065, + "num_tokens": 607608462.0, + "step": 15927 + }, + { + "epoch": 2.0262053173896453, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.5996804237365723, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8594032526016235, + "num_tokens": 607645272.0, + "step": 15928 + }, + { + "epoch": 2.026332527668236, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8629570007324219, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8698359131813049, + "num_tokens": 607682585.0, + "step": 15929 + }, + { + "epoch": 2.0264597379468263, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.7493577003479004, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8843721151351929, + "num_tokens": 607721302.0, + "step": 15930 + }, + { + "epoch": 2.0265869482254164, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9604099988937378, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8632615208625793, + "num_tokens": 607758357.0, + "step": 15931 + }, + { + "epoch": 2.026714158504007, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.879847526550293, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8675004243850708, + "num_tokens": 607795467.0, + "step": 15932 + }, + { + "epoch": 2.0268413687825975, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9177943468093872, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.861322283744812, + "num_tokens": 607833192.0, + "step": 15933 + }, + { + "epoch": 2.026968579061188, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.0839786529541016, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8765413165092468, + "num_tokens": 607866672.0, + "step": 15934 + }, + { + "epoch": 2.0270957893397785, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.050219774246216, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8664976358413696, + "num_tokens": 607907976.0, + "step": 15935 + }, + { + "epoch": 2.027222999618369, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8905657529830933, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.870437741279602, + "num_tokens": 607946009.0, + "step": 15936 + }, + { + "epoch": 2.0273502098969596, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.8121907711029053, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8711782693862915, + "num_tokens": 607986800.0, + "step": 15937 + }, + { + "epoch": 2.02747742017555, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.2711076736450195, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8605025410652161, + "num_tokens": 608021807.0, + "step": 15938 + }, + { + "epoch": 2.0276046304541406, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 3.140531063079834, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.866046667098999, + "num_tokens": 608060294.0, + "step": 15939 + }, + { + "epoch": 2.027731840732731, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.948788046836853, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8655620217323303, + "num_tokens": 608100790.0, + "step": 15940 + }, + { + "epoch": 2.0278590510113217, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.024670124053955, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8622255921363831, + "num_tokens": 608142164.0, + "step": 15941 + }, + { + "epoch": 2.027986261289912, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9929771423339844, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8668404221534729, + "num_tokens": 608184175.0, + "step": 15942 + }, + { + "epoch": 2.0281134715685027, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.78639817237854, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8605746030807495, + "num_tokens": 608226801.0, + "step": 15943 + }, + { + "epoch": 2.0282406818470933, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.812626838684082, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8743929266929626, + "num_tokens": 608265155.0, + "step": 15944 + }, + { + "epoch": 2.028367892125684, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8056198358535767, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8768917322158813, + "num_tokens": 608303817.0, + "step": 15945 + }, + { + "epoch": 2.0284951024042743, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9520106315612793, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8744565844535828, + "num_tokens": 608338845.0, + "step": 15946 + }, + { + "epoch": 2.028622312682865, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.8850550651550293, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8666380643844604, + "num_tokens": 608377661.0, + "step": 15947 + }, + { + "epoch": 2.0287495229614554, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 3.1939384937286377, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8665585517883301, + "num_tokens": 608412897.0, + "step": 15948 + }, + { + "epoch": 2.028876733240046, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.014932155609131, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8590415716171265, + "num_tokens": 608453584.0, + "step": 15949 + }, + { + "epoch": 2.0290039435186364, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0216875076293945, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8634116053581238, + "num_tokens": 608492211.0, + "step": 15950 + }, + { + "epoch": 2.029131153797227, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0164854526519775, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8628427982330322, + "num_tokens": 608534317.0, + "step": 15951 + }, + { + "epoch": 2.0292583640758175, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.90850031375885, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8774126768112183, + "num_tokens": 608569763.0, + "step": 15952 + }, + { + "epoch": 2.029385574354408, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.730133295059204, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.869640052318573, + "num_tokens": 608612145.0, + "step": 15953 + }, + { + "epoch": 2.0295127846329986, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8159955739974976, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8672342896461487, + "num_tokens": 608649528.0, + "step": 15954 + }, + { + "epoch": 2.0296399949115886, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.072503089904785, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8599808812141418, + "num_tokens": 608686973.0, + "step": 15955 + }, + { + "epoch": 2.029767205190179, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8383452892303467, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8687800765037537, + "num_tokens": 608729169.0, + "step": 15956 + }, + { + "epoch": 2.0298944154687697, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0679984092712402, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8645331263542175, + "num_tokens": 608764502.0, + "step": 15957 + }, + { + "epoch": 2.0300216257473602, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9030513763427734, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8676245808601379, + "num_tokens": 608805190.0, + "step": 15958 + }, + { + "epoch": 2.0301488360259508, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8468531370162964, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8722813129425049, + "num_tokens": 608840006.0, + "step": 15959 + }, + { + "epoch": 2.0302760463045413, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.962308645248413, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8742523193359375, + "num_tokens": 608871551.0, + "step": 15960 + }, + { + "epoch": 2.030403256583132, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8959259986877441, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8452222347259521, + "num_tokens": 608909610.0, + "step": 15961 + }, + { + "epoch": 2.0305304668617223, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.3147003650665283, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8635200262069702, + "num_tokens": 608950758.0, + "step": 15962 + }, + { + "epoch": 2.030657677140313, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9845267534255981, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8544604778289795, + "num_tokens": 608990695.0, + "step": 15963 + }, + { + "epoch": 2.0307848874189034, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9310201406478882, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8817148208618164, + "num_tokens": 609030978.0, + "step": 15964 + }, + { + "epoch": 2.030912097697494, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9078999757766724, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8765245676040649, + "num_tokens": 609071974.0, + "step": 15965 + }, + { + "epoch": 2.0310393079760845, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.97869074344635, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8699153661727905, + "num_tokens": 609111268.0, + "step": 15966 + }, + { + "epoch": 2.031166518254675, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9712551832199097, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8626450300216675, + "num_tokens": 609149681.0, + "step": 15967 + }, + { + "epoch": 2.0312937285332655, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.4390227794647217, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8721957206726074, + "num_tokens": 609189121.0, + "step": 15968 + }, + { + "epoch": 2.031420938811856, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.0213966369628906, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8764175772666931, + "num_tokens": 609228070.0, + "step": 15969 + }, + { + "epoch": 2.0315481490904466, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.809626817703247, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8694819211959839, + "num_tokens": 609269718.0, + "step": 15970 + }, + { + "epoch": 2.031675359369037, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9177989959716797, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8691126108169556, + "num_tokens": 609307236.0, + "step": 15971 + }, + { + "epoch": 2.0318025696476276, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9852180480957031, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8477675914764404, + "num_tokens": 609342993.0, + "step": 15972 + }, + { + "epoch": 2.031929779926218, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.996725082397461, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8742791414260864, + "num_tokens": 609376817.0, + "step": 15973 + }, + { + "epoch": 2.0320569902048087, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9242247343063354, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8651347160339355, + "num_tokens": 609411209.0, + "step": 15974 + }, + { + "epoch": 2.032184200483399, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.316887617111206, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8845223188400269, + "num_tokens": 609449055.0, + "step": 15975 + }, + { + "epoch": 2.0323114107619897, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.026451826095581, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8653214573860168, + "num_tokens": 609486332.0, + "step": 15976 + }, + { + "epoch": 2.0324386210405803, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9282264709472656, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8527412414550781, + "num_tokens": 609526924.0, + "step": 15977 + }, + { + "epoch": 2.032565831319171, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8691515922546387, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8796561360359192, + "num_tokens": 609559542.0, + "step": 15978 + }, + { + "epoch": 2.032693041597761, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.7958580255508423, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8607261180877686, + "num_tokens": 609598989.0, + "step": 15979 + }, + { + "epoch": 2.0328202518763514, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.0325405597686768, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8543308973312378, + "num_tokens": 609635790.0, + "step": 15980 + }, + { + "epoch": 2.032947462154942, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8916014432907104, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.859761118888855, + "num_tokens": 609670120.0, + "step": 15981 + }, + { + "epoch": 2.0330746724335325, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.7360390424728394, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8651976585388184, + "num_tokens": 609713260.0, + "step": 15982 + }, + { + "epoch": 2.033201882712123, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.7353765964508057, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8724849820137024, + "num_tokens": 609753236.0, + "step": 15983 + }, + { + "epoch": 2.0333290929907135, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8703237771987915, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8698129653930664, + "num_tokens": 609791629.0, + "step": 15984 + }, + { + "epoch": 2.033456303269304, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8912285566329956, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8514487147331238, + "num_tokens": 609833457.0, + "step": 15985 + }, + { + "epoch": 2.0335835135478946, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.093740224838257, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8588039875030518, + "num_tokens": 609866007.0, + "step": 15986 + }, + { + "epoch": 2.033710723826485, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.6373202800750732, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8715544939041138, + "num_tokens": 609903346.0, + "step": 15987 + }, + { + "epoch": 2.0338379341050756, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0260169506073, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8715739846229553, + "num_tokens": 609935885.0, + "step": 15988 + }, + { + "epoch": 2.033965144383666, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.081397533416748, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8604627251625061, + "num_tokens": 609971127.0, + "step": 15989 + }, + { + "epoch": 2.0340923546622567, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8535553216934204, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8583192825317383, + "num_tokens": 610016719.0, + "step": 15990 + }, + { + "epoch": 2.034219564940847, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.925199031829834, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8556877374649048, + "num_tokens": 610057002.0, + "step": 15991 + }, + { + "epoch": 2.0343467752194377, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8280044794082642, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.870934009552002, + "num_tokens": 610092779.0, + "step": 15992 + }, + { + "epoch": 2.0344739854980283, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.779967188835144, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.856637716293335, + "num_tokens": 610137092.0, + "step": 15993 + }, + { + "epoch": 2.034601195776619, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7973690032958984, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8752303123474121, + "num_tokens": 610178632.0, + "step": 15994 + }, + { + "epoch": 2.0347284060552093, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0661661624908447, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8653230667114258, + "num_tokens": 610219789.0, + "step": 15995 + }, + { + "epoch": 2.0348556163338, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0383877754211426, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8532220721244812, + "num_tokens": 610259104.0, + "step": 15996 + }, + { + "epoch": 2.0349828266123904, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3840746879577637, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8592755794525146, + "num_tokens": 610296317.0, + "step": 15997 + }, + { + "epoch": 2.035110036890981, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7935335636138916, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8655943870544434, + "num_tokens": 610339300.0, + "step": 15998 + }, + { + "epoch": 2.0352372471695714, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9936773777008057, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8603065013885498, + "num_tokens": 610373917.0, + "step": 15999 + }, + { + "epoch": 2.035364457448162, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.056361436843872, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8658446669578552, + "num_tokens": 610419571.0, + "step": 16000 + }, + { + "epoch": 2.0354916677267525, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9738948345184326, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8582562208175659, + "num_tokens": 610458073.0, + "step": 16001 + }, + { + "epoch": 2.035618878005343, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.5339760780334473, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8565311431884766, + "num_tokens": 610497223.0, + "step": 16002 + }, + { + "epoch": 2.0357460882839336, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0132570266723633, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8478515148162842, + "num_tokens": 610532563.0, + "step": 16003 + }, + { + "epoch": 2.0358732985625236, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0610694885253906, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8780142664909363, + "num_tokens": 610566644.0, + "step": 16004 + }, + { + "epoch": 2.036000508841114, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.991443395614624, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8543832302093506, + "num_tokens": 610604755.0, + "step": 16005 + }, + { + "epoch": 2.0361277191197047, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9600920677185059, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8693006038665771, + "num_tokens": 610645905.0, + "step": 16006 + }, + { + "epoch": 2.036254929398295, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0068905353546143, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8842262625694275, + "num_tokens": 610686800.0, + "step": 16007 + }, + { + "epoch": 2.0363821396768857, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9617277383804321, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8658573627471924, + "num_tokens": 610728291.0, + "step": 16008 + }, + { + "epoch": 2.0365093499554763, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1914005279541016, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8638946413993835, + "num_tokens": 610759818.0, + "step": 16009 + }, + { + "epoch": 2.036636560234067, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9387221336364746, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8521507978439331, + "num_tokens": 610800125.0, + "step": 16010 + }, + { + "epoch": 2.0367637705126573, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9308929443359375, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8579340577125549, + "num_tokens": 610839261.0, + "step": 16011 + }, + { + "epoch": 2.036890980791248, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.921440839767456, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8709986209869385, + "num_tokens": 610879092.0, + "step": 16012 + }, + { + "epoch": 2.0370181910698384, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9655081033706665, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.86644047498703, + "num_tokens": 610914494.0, + "step": 16013 + }, + { + "epoch": 2.037145401348429, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0123727321624756, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8763355016708374, + "num_tokens": 610950784.0, + "step": 16014 + }, + { + "epoch": 2.0372726116270194, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.850486159324646, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8552263975143433, + "num_tokens": 610989254.0, + "step": 16015 + }, + { + "epoch": 2.03739982190561, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.082343339920044, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8593855500221252, + "num_tokens": 611022668.0, + "step": 16016 + }, + { + "epoch": 2.0375270321842005, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9340801239013672, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8672715425491333, + "num_tokens": 611062110.0, + "step": 16017 + }, + { + "epoch": 2.037654242462791, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8179070949554443, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8716745972633362, + "num_tokens": 611098663.0, + "step": 16018 + }, + { + "epoch": 2.0377814527413816, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9438748359680176, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8639721274375916, + "num_tokens": 611134058.0, + "step": 16019 + }, + { + "epoch": 2.037908663019972, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.026479721069336, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.866360068321228, + "num_tokens": 611174748.0, + "step": 16020 + }, + { + "epoch": 2.0380358732985626, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.806113600730896, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8828028440475464, + "num_tokens": 611216792.0, + "step": 16021 + }, + { + "epoch": 2.038163083577153, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0171263217926025, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8757221698760986, + "num_tokens": 611253996.0, + "step": 16022 + }, + { + "epoch": 2.0382902938557437, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9315904378890991, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8760761022567749, + "num_tokens": 611295351.0, + "step": 16023 + }, + { + "epoch": 2.038417504134334, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9663008451461792, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8654884696006775, + "num_tokens": 611330513.0, + "step": 16024 + }, + { + "epoch": 2.0385447144129247, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9837855100631714, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8744543194770813, + "num_tokens": 611369679.0, + "step": 16025 + }, + { + "epoch": 2.0386719246915153, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.211641550064087, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8736433982849121, + "num_tokens": 611410398.0, + "step": 16026 + }, + { + "epoch": 2.038799134970106, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9468162059783936, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8582850694656372, + "num_tokens": 611447151.0, + "step": 16027 + }, + { + "epoch": 2.0389263452486963, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8453489542007446, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8576511740684509, + "num_tokens": 611486923.0, + "step": 16028 + }, + { + "epoch": 2.0390535555272864, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1583776473999023, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8713919520378113, + "num_tokens": 611519502.0, + "step": 16029 + }, + { + "epoch": 2.039180765805877, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9744384288787842, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8523703813552856, + "num_tokens": 611557647.0, + "step": 16030 + }, + { + "epoch": 2.0393079760844675, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8692879676818848, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8651047945022583, + "num_tokens": 611597405.0, + "step": 16031 + }, + { + "epoch": 2.039435186363058, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9042620658874512, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8593375086784363, + "num_tokens": 611635996.0, + "step": 16032 + }, + { + "epoch": 2.0395623966416485, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.6160364151000977, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8739414215087891, + "num_tokens": 611677900.0, + "step": 16033 + }, + { + "epoch": 2.039689606920239, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8823661804199219, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8697496652603149, + "num_tokens": 611719184.0, + "step": 16034 + }, + { + "epoch": 2.0398168171988296, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7944883108139038, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8651491403579712, + "num_tokens": 611761270.0, + "step": 16035 + }, + { + "epoch": 2.03994402747742, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 5.161642551422119, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8646740317344666, + "num_tokens": 611794939.0, + "step": 16036 + }, + { + "epoch": 2.0400712377560106, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8489418029785156, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8775557279586792, + "num_tokens": 611834093.0, + "step": 16037 + }, + { + "epoch": 2.040198448034601, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9113341569900513, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.860532283782959, + "num_tokens": 611871327.0, + "step": 16038 + }, + { + "epoch": 2.0403256583131917, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0587470531463623, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8586162328720093, + "num_tokens": 611907837.0, + "step": 16039 + }, + { + "epoch": 2.040452868591782, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0098330974578857, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8474029302597046, + "num_tokens": 611950964.0, + "step": 16040 + }, + { + "epoch": 2.0405800788703727, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.188232898712158, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8618117570877075, + "num_tokens": 611985867.0, + "step": 16041 + }, + { + "epoch": 2.0407072891489633, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.9096853733062744, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8700687289237976, + "num_tokens": 612017819.0, + "step": 16042 + }, + { + "epoch": 2.040834499427554, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.7290098667144775, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8502055406570435, + "num_tokens": 612060669.0, + "step": 16043 + }, + { + "epoch": 2.0409617097061443, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.138593912124634, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8500174283981323, + "num_tokens": 612099801.0, + "step": 16044 + }, + { + "epoch": 2.041088919984735, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8428387641906738, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.86978679895401, + "num_tokens": 612139454.0, + "step": 16045 + }, + { + "epoch": 2.0412161302633254, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8368911743164062, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8468188047409058, + "num_tokens": 612181695.0, + "step": 16046 + }, + { + "epoch": 2.041343340541916, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.6778125762939453, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.881592869758606, + "num_tokens": 612223200.0, + "step": 16047 + }, + { + "epoch": 2.0414705508205064, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8357892036437988, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.868008553981781, + "num_tokens": 612265946.0, + "step": 16048 + }, + { + "epoch": 2.041597761099097, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.898803472518921, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8831871747970581, + "num_tokens": 612301246.0, + "step": 16049 + }, + { + "epoch": 2.0417249713776875, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.020779848098755, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8505947589874268, + "num_tokens": 612337789.0, + "step": 16050 + }, + { + "epoch": 2.041852181656278, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8843910694122314, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8785827159881592, + "num_tokens": 612372391.0, + "step": 16051 + }, + { + "epoch": 2.0419793919348685, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.729185104370117, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8689595460891724, + "num_tokens": 612408806.0, + "step": 16052 + }, + { + "epoch": 2.0421066022134586, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.099360227584839, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8774158954620361, + "num_tokens": 612446349.0, + "step": 16053 + }, + { + "epoch": 2.042233812492049, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.784817099571228, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8788180351257324, + "num_tokens": 612489899.0, + "step": 16054 + }, + { + "epoch": 2.0423610227706397, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8897161483764648, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8618124127388, + "num_tokens": 612532314.0, + "step": 16055 + }, + { + "epoch": 2.04248823304923, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8961620330810547, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8739519119262695, + "num_tokens": 612568033.0, + "step": 16056 + }, + { + "epoch": 2.0426154433278207, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8490639925003052, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8722431659698486, + "num_tokens": 612605160.0, + "step": 16057 + }, + { + "epoch": 2.0427426536064113, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.7905641794204712, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.853867769241333, + "num_tokens": 612647851.0, + "step": 16058 + }, + { + "epoch": 2.042869863885002, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.97324800491333, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8659600019454956, + "num_tokens": 612684770.0, + "step": 16059 + }, + { + "epoch": 2.0429970741635923, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.040229082107544, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8559699654579163, + "num_tokens": 612721689.0, + "step": 16060 + }, + { + "epoch": 2.043124284442183, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9143671989440918, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.864638090133667, + "num_tokens": 612757256.0, + "step": 16061 + }, + { + "epoch": 2.0432514947207734, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8276795148849487, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8721969723701477, + "num_tokens": 612797806.0, + "step": 16062 + }, + { + "epoch": 2.043378704999364, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8621020317077637, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8771255016326904, + "num_tokens": 612833507.0, + "step": 16063 + }, + { + "epoch": 2.0435059152779544, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9392926692962646, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8722102046012878, + "num_tokens": 612872920.0, + "step": 16064 + }, + { + "epoch": 2.043633125556545, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9291174411773682, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8733603358268738, + "num_tokens": 612909534.0, + "step": 16065 + }, + { + "epoch": 2.0437603358351355, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8067580461502075, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.863477349281311, + "num_tokens": 612950531.0, + "step": 16066 + }, + { + "epoch": 2.043887546113726, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.021298885345459, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8690674901008606, + "num_tokens": 612984638.0, + "step": 16067 + }, + { + "epoch": 2.0440147563923166, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9811830520629883, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8772738575935364, + "num_tokens": 613020632.0, + "step": 16068 + }, + { + "epoch": 2.044141966670907, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.975532054901123, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8702524900436401, + "num_tokens": 613057381.0, + "step": 16069 + }, + { + "epoch": 2.0442691769494976, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8607449531555176, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8636859655380249, + "num_tokens": 613094287.0, + "step": 16070 + }, + { + "epoch": 2.044396387228088, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.975326418876648, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8732995986938477, + "num_tokens": 613126315.0, + "step": 16071 + }, + { + "epoch": 2.0445235975066787, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0082104206085205, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8622267842292786, + "num_tokens": 613160970.0, + "step": 16072 + }, + { + "epoch": 2.044650807785269, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0621228218078613, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8693448305130005, + "num_tokens": 613204685.0, + "step": 16073 + }, + { + "epoch": 2.0447780180638597, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.012787342071533, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8753649592399597, + "num_tokens": 613242394.0, + "step": 16074 + }, + { + "epoch": 2.0449052283424503, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8288958072662354, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8749688863754272, + "num_tokens": 613283517.0, + "step": 16075 + }, + { + "epoch": 2.045032438621041, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0270633697509766, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.869925856590271, + "num_tokens": 613317237.0, + "step": 16076 + }, + { + "epoch": 2.045159648899631, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8332289457321167, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8703750967979431, + "num_tokens": 613355741.0, + "step": 16077 + }, + { + "epoch": 2.0452868591782214, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9040981531143188, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8578404784202576, + "num_tokens": 613392131.0, + "step": 16078 + }, + { + "epoch": 2.045414069456812, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9259594678878784, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8572462797164917, + "num_tokens": 613433687.0, + "step": 16079 + }, + { + "epoch": 2.0455412797354025, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8758792877197266, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8703652620315552, + "num_tokens": 613472882.0, + "step": 16080 + }, + { + "epoch": 2.045668490013993, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.739281177520752, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8716894388198853, + "num_tokens": 613515869.0, + "step": 16081 + }, + { + "epoch": 2.0457957002925835, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.875585913658142, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8675057888031006, + "num_tokens": 613551835.0, + "step": 16082 + }, + { + "epoch": 2.045922910571174, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8765168190002441, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8654923439025879, + "num_tokens": 613592277.0, + "step": 16083 + }, + { + "epoch": 2.0460501208497646, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0390450954437256, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8666731119155884, + "num_tokens": 613625594.0, + "step": 16084 + }, + { + "epoch": 2.046177331128355, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.99408757686615, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8693735599517822, + "num_tokens": 613659924.0, + "step": 16085 + }, + { + "epoch": 2.0463045414069456, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9983628988265991, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8498591780662537, + "num_tokens": 613697067.0, + "step": 16086 + }, + { + "epoch": 2.046431751685536, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9203550815582275, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8746829032897949, + "num_tokens": 613738211.0, + "step": 16087 + }, + { + "epoch": 2.0465589619641267, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.7654889822006226, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8729904890060425, + "num_tokens": 613782590.0, + "step": 16088 + }, + { + "epoch": 2.046686172242717, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8095910549163818, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8748703598976135, + "num_tokens": 613825770.0, + "step": 16089 + }, + { + "epoch": 2.0468133825213077, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8457999229431152, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8716997504234314, + "num_tokens": 613864025.0, + "step": 16090 + }, + { + "epoch": 2.0469405927998983, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.7693191766738892, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8695586919784546, + "num_tokens": 613907181.0, + "step": 16091 + }, + { + "epoch": 2.047067803078489, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9111372232437134, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8583439588546753, + "num_tokens": 613951258.0, + "step": 16092 + }, + { + "epoch": 2.0471950133570793, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.896289348602295, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8651524186134338, + "num_tokens": 613991221.0, + "step": 16093 + }, + { + "epoch": 2.04732222363567, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.0014967918395996, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8777657151222229, + "num_tokens": 614026862.0, + "step": 16094 + }, + { + "epoch": 2.0474494339142604, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8493038415908813, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8574290871620178, + "num_tokens": 614067560.0, + "step": 16095 + }, + { + "epoch": 2.047576644192851, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.1002798080444336, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8707733154296875, + "num_tokens": 614106974.0, + "step": 16096 + }, + { + "epoch": 2.0477038544714414, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8555033206939697, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8937608599662781, + "num_tokens": 614145641.0, + "step": 16097 + }, + { + "epoch": 2.047831064750032, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9133799076080322, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8723145127296448, + "num_tokens": 614181476.0, + "step": 16098 + }, + { + "epoch": 2.0479582750286225, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9365462064743042, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8618603944778442, + "num_tokens": 614219212.0, + "step": 16099 + }, + { + "epoch": 2.048085485307213, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.1921825408935547, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8385241031646729, + "num_tokens": 614264017.0, + "step": 16100 + }, + { + "epoch": 2.0482126955858035, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.2425546646118164, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8501634001731873, + "num_tokens": 614299031.0, + "step": 16101 + }, + { + "epoch": 2.0483399058643936, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.990134358406067, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8737828135490417, + "num_tokens": 614334387.0, + "step": 16102 + }, + { + "epoch": 2.048467116142984, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9257336854934692, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8685173988342285, + "num_tokens": 614372438.0, + "step": 16103 + }, + { + "epoch": 2.0485943264215747, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.822967767715454, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8626455068588257, + "num_tokens": 614409976.0, + "step": 16104 + }, + { + "epoch": 2.048721536700165, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8918119668960571, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.874973475933075, + "num_tokens": 614448740.0, + "step": 16105 + }, + { + "epoch": 2.0488487469787557, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8795472383499146, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8707961440086365, + "num_tokens": 614484644.0, + "step": 16106 + }, + { + "epoch": 2.0489759572573463, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9605791568756104, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8477965593338013, + "num_tokens": 614521210.0, + "step": 16107 + }, + { + "epoch": 2.049103167535937, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.846101999282837, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8616463541984558, + "num_tokens": 614561783.0, + "step": 16108 + }, + { + "epoch": 2.0492303778145273, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9447487592697144, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8679956197738647, + "num_tokens": 614598643.0, + "step": 16109 + }, + { + "epoch": 2.049357588093118, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.083047389984131, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8668508529663086, + "num_tokens": 614633520.0, + "step": 16110 + }, + { + "epoch": 2.0494847983717084, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.056410789489746, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8541803359985352, + "num_tokens": 614673266.0, + "step": 16111 + }, + { + "epoch": 2.049612008650299, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0705983638763428, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8578367829322815, + "num_tokens": 614707201.0, + "step": 16112 + }, + { + "epoch": 2.0497392189288894, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1636176109313965, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8716920614242554, + "num_tokens": 614746542.0, + "step": 16113 + }, + { + "epoch": 2.04986642920748, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8520495891571045, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8571213483810425, + "num_tokens": 614781804.0, + "step": 16114 + }, + { + "epoch": 2.0499936394860705, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8245779275894165, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.866777777671814, + "num_tokens": 614817281.0, + "step": 16115 + }, + { + "epoch": 2.050120849764661, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.941052794456482, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8572649955749512, + "num_tokens": 614853666.0, + "step": 16116 + }, + { + "epoch": 2.0502480600432516, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8286200761795044, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.868174135684967, + "num_tokens": 614891221.0, + "step": 16117 + }, + { + "epoch": 2.050375270321842, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0501351356506348, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.874313235282898, + "num_tokens": 614925956.0, + "step": 16118 + }, + { + "epoch": 2.0505024806004326, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9833203554153442, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8742015361785889, + "num_tokens": 614959934.0, + "step": 16119 + }, + { + "epoch": 2.050629690879023, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.968822956085205, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8630038499832153, + "num_tokens": 615000579.0, + "step": 16120 + }, + { + "epoch": 2.0507569011576137, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0560340881347656, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8608841896057129, + "num_tokens": 615037371.0, + "step": 16121 + }, + { + "epoch": 2.050884111436204, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9420620203018188, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8566385507583618, + "num_tokens": 615075948.0, + "step": 16122 + }, + { + "epoch": 2.0510113217147947, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7853820323944092, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8815746307373047, + "num_tokens": 615114619.0, + "step": 16123 + }, + { + "epoch": 2.0511385319933853, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8888392448425293, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8671234846115112, + "num_tokens": 615149260.0, + "step": 16124 + }, + { + "epoch": 2.051265742271976, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.358670473098755, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8689754009246826, + "num_tokens": 615183284.0, + "step": 16125 + }, + { + "epoch": 2.0513929525505663, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9549541473388672, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8729662895202637, + "num_tokens": 615220015.0, + "step": 16126 + }, + { + "epoch": 2.0515201628291564, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0464792251586914, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8606559038162231, + "num_tokens": 615255614.0, + "step": 16127 + }, + { + "epoch": 2.051647373107747, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9699909687042236, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8679696917533875, + "num_tokens": 615290232.0, + "step": 16128 + }, + { + "epoch": 2.0517745833863374, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8279904127120972, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8725735545158386, + "num_tokens": 615328204.0, + "step": 16129 + }, + { + "epoch": 2.051901793664928, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9863715171813965, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8581749200820923, + "num_tokens": 615362846.0, + "step": 16130 + }, + { + "epoch": 2.0520290039435185, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7966023683547974, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8661797046661377, + "num_tokens": 615405404.0, + "step": 16131 + }, + { + "epoch": 2.052156214222109, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 3.1335887908935547, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.854276716709137, + "num_tokens": 615439835.0, + "step": 16132 + }, + { + "epoch": 2.0522834245006996, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0275051593780518, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8765008449554443, + "num_tokens": 615477832.0, + "step": 16133 + }, + { + "epoch": 2.05241063477929, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9314863681793213, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8571887612342834, + "num_tokens": 615518089.0, + "step": 16134 + }, + { + "epoch": 2.0525378450578806, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9363020658493042, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8712233304977417, + "num_tokens": 615557286.0, + "step": 16135 + }, + { + "epoch": 2.052665055336471, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8750048875808716, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.871585488319397, + "num_tokens": 615593284.0, + "step": 16136 + }, + { + "epoch": 2.0527922656150617, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0366196632385254, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8629953861236572, + "num_tokens": 615633579.0, + "step": 16137 + }, + { + "epoch": 2.052919475893652, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.642296552658081, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8714233040809631, + "num_tokens": 615680135.0, + "step": 16138 + }, + { + "epoch": 2.0530466861722427, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.1854398250579834, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8542512655258179, + "num_tokens": 615714801.0, + "step": 16139 + }, + { + "epoch": 2.0531738964508333, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8754253387451172, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8599132895469666, + "num_tokens": 615750538.0, + "step": 16140 + }, + { + "epoch": 2.053301106729424, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.822616696357727, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8697793483734131, + "num_tokens": 615789465.0, + "step": 16141 + }, + { + "epoch": 2.0534283170080143, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8219332695007324, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8519018888473511, + "num_tokens": 615832553.0, + "step": 16142 + }, + { + "epoch": 2.053555527286605, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8483587503433228, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.862865686416626, + "num_tokens": 615868341.0, + "step": 16143 + }, + { + "epoch": 2.0536827375651954, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9137581586837769, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8764761686325073, + "num_tokens": 615904324.0, + "step": 16144 + }, + { + "epoch": 2.053809947843786, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.6649922132492065, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8739603161811829, + "num_tokens": 615947656.0, + "step": 16145 + }, + { + "epoch": 2.0539371581223764, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8125032186508179, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8609705567359924, + "num_tokens": 615989050.0, + "step": 16146 + }, + { + "epoch": 2.054064368400967, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8422355651855469, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8709108233451843, + "num_tokens": 616027638.0, + "step": 16147 + }, + { + "epoch": 2.0541915786795575, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9914659261703491, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8750967979431152, + "num_tokens": 616066392.0, + "step": 16148 + }, + { + "epoch": 2.054318788958148, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.78126060962677, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8746414184570312, + "num_tokens": 616108303.0, + "step": 16149 + }, + { + "epoch": 2.0544459992367385, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7472783327102661, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8589662313461304, + "num_tokens": 616154174.0, + "step": 16150 + }, + { + "epoch": 2.0545732095153286, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.5919952392578125, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8634339570999146, + "num_tokens": 616193954.0, + "step": 16151 + }, + { + "epoch": 2.054700419793919, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.823976755142212, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8718882203102112, + "num_tokens": 616232771.0, + "step": 16152 + }, + { + "epoch": 2.0548276300725097, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0521347522735596, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8775652050971985, + "num_tokens": 616265151.0, + "step": 16153 + }, + { + "epoch": 2.0549548403511, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9133967161178589, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8606601357460022, + "num_tokens": 616303886.0, + "step": 16154 + }, + { + "epoch": 2.0550820506296907, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.879721999168396, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8773258924484253, + "num_tokens": 616340040.0, + "step": 16155 + }, + { + "epoch": 2.0552092609082813, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0181562900543213, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8711308240890503, + "num_tokens": 616371329.0, + "step": 16156 + }, + { + "epoch": 2.055336471186872, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.932029128074646, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8798765540122986, + "num_tokens": 616406383.0, + "step": 16157 + }, + { + "epoch": 2.0554636814654623, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0100724697113037, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8536098003387451, + "num_tokens": 616445600.0, + "step": 16158 + }, + { + "epoch": 2.055590891744053, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9514299631118774, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8585189580917358, + "num_tokens": 616485376.0, + "step": 16159 + }, + { + "epoch": 2.0557181020226434, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8392438888549805, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8558950424194336, + "num_tokens": 616527104.0, + "step": 16160 + }, + { + "epoch": 2.055845312301234, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9367238283157349, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8691588044166565, + "num_tokens": 616567557.0, + "step": 16161 + }, + { + "epoch": 2.0559725225798244, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8951761722564697, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.871228814125061, + "num_tokens": 616601612.0, + "step": 16162 + }, + { + "epoch": 2.056099732858415, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8639791011810303, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8643186092376709, + "num_tokens": 616638019.0, + "step": 16163 + }, + { + "epoch": 2.0562269431370055, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0769879817962646, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8656868934631348, + "num_tokens": 616675224.0, + "step": 16164 + }, + { + "epoch": 2.056354153415596, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.981096625328064, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8632681965827942, + "num_tokens": 616713628.0, + "step": 16165 + }, + { + "epoch": 2.0564813636941865, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0337157249450684, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8643249273300171, + "num_tokens": 616755985.0, + "step": 16166 + }, + { + "epoch": 2.056608573972777, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9599391222000122, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.86798495054245, + "num_tokens": 616786564.0, + "step": 16167 + }, + { + "epoch": 2.0567357842513676, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0279836654663086, + "learning_rate": 1e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.83791184425354, + "num_tokens": 616823285.0, + "step": 16168 + }, + { + "epoch": 2.056862994529958, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9586504697799683, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8752164244651794, + "num_tokens": 616864465.0, + "step": 16169 + }, + { + "epoch": 2.0569902048085487, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8049683570861816, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8568170070648193, + "num_tokens": 616906670.0, + "step": 16170 + }, + { + "epoch": 2.057117415087139, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7670562267303467, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8660770654678345, + "num_tokens": 616947656.0, + "step": 16171 + }, + { + "epoch": 2.0572446253657297, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.071699857711792, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8595410585403442, + "num_tokens": 616980161.0, + "step": 16172 + }, + { + "epoch": 2.0573718356443202, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.04349946975708, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8650258779525757, + "num_tokens": 617015281.0, + "step": 16173 + }, + { + "epoch": 2.0574990459229108, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8536888360977173, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8678155541419983, + "num_tokens": 617054174.0, + "step": 16174 + }, + { + "epoch": 2.057626256201501, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.906533122062683, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8671410083770752, + "num_tokens": 617095160.0, + "step": 16175 + }, + { + "epoch": 2.0577534664800914, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1012303829193115, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8700931668281555, + "num_tokens": 617130781.0, + "step": 16176 + }, + { + "epoch": 2.057880676758682, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.085667848587036, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8647831678390503, + "num_tokens": 617169473.0, + "step": 16177 + }, + { + "epoch": 2.0580078870372724, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.970759391784668, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8593287467956543, + "num_tokens": 617212208.0, + "step": 16178 + }, + { + "epoch": 2.058135097315863, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9511383771896362, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8670642375946045, + "num_tokens": 617246055.0, + "step": 16179 + }, + { + "epoch": 2.0582623075944535, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.889636754989624, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8615278005599976, + "num_tokens": 617285614.0, + "step": 16180 + }, + { + "epoch": 2.058389517873044, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9581794738769531, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8729395866394043, + "num_tokens": 617323401.0, + "step": 16181 + }, + { + "epoch": 2.0585167281516346, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7545521259307861, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8528979420661926, + "num_tokens": 617372550.0, + "step": 16182 + }, + { + "epoch": 2.058643938430225, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8810794353485107, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8710347414016724, + "num_tokens": 617407894.0, + "step": 16183 + }, + { + "epoch": 2.0587711487088156, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9113277196884155, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8581955432891846, + "num_tokens": 617450117.0, + "step": 16184 + }, + { + "epoch": 2.058898358987406, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9032654762268066, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8709569573402405, + "num_tokens": 617485885.0, + "step": 16185 + }, + { + "epoch": 2.0590255692659967, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.035923719406128, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8494113683700562, + "num_tokens": 617520511.0, + "step": 16186 + }, + { + "epoch": 2.059152779544587, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8727937936782837, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8577203154563904, + "num_tokens": 617558288.0, + "step": 16187 + }, + { + "epoch": 2.0592799898231777, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9775947332382202, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8654178977012634, + "num_tokens": 617602991.0, + "step": 16188 + }, + { + "epoch": 2.0594072001017683, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9900144338607788, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8642318248748779, + "num_tokens": 617639629.0, + "step": 16189 + }, + { + "epoch": 2.059534410380359, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9328213930130005, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8600819706916809, + "num_tokens": 617676218.0, + "step": 16190 + }, + { + "epoch": 2.0596616206589493, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8682856559753418, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8594832420349121, + "num_tokens": 617716270.0, + "step": 16191 + }, + { + "epoch": 2.05978883093754, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.82844877243042, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8744595050811768, + "num_tokens": 617755559.0, + "step": 16192 + }, + { + "epoch": 2.0599160412161304, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8737095594406128, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8695192337036133, + "num_tokens": 617793522.0, + "step": 16193 + }, + { + "epoch": 2.060043251494721, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9123146533966064, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8705111742019653, + "num_tokens": 617828005.0, + "step": 16194 + }, + { + "epoch": 2.0601704617733114, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.931773066520691, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8616636991500854, + "num_tokens": 617863824.0, + "step": 16195 + }, + { + "epoch": 2.060297672051902, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.857353687286377, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.86337810754776, + "num_tokens": 617903648.0, + "step": 16196 + }, + { + "epoch": 2.0604248823304925, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7961984872817993, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8620409369468689, + "num_tokens": 617946229.0, + "step": 16197 + }, + { + "epoch": 2.060552092609083, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8372349739074707, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8675274848937988, + "num_tokens": 617990486.0, + "step": 16198 + }, + { + "epoch": 2.0606793028876735, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9486687183380127, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.878816545009613, + "num_tokens": 618031743.0, + "step": 16199 + }, + { + "epoch": 2.0608065131662636, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.2945163249969482, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8630377054214478, + "num_tokens": 618070181.0, + "step": 16200 + }, + { + "epoch": 2.060933723444854, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.903835654258728, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8721202611923218, + "num_tokens": 618106396.0, + "step": 16201 + }, + { + "epoch": 2.0610609337234447, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8328782320022583, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8719157576560974, + "num_tokens": 618148281.0, + "step": 16202 + }, + { + "epoch": 2.061188144002035, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0439698696136475, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8599514961242676, + "num_tokens": 618182721.0, + "step": 16203 + }, + { + "epoch": 2.0613153542806257, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.123279571533203, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8823559880256653, + "num_tokens": 618219497.0, + "step": 16204 + }, + { + "epoch": 2.0614425645592163, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8226746320724487, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8827811479568481, + "num_tokens": 618258643.0, + "step": 16205 + }, + { + "epoch": 2.061569774837807, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.001206159591675, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8806374073028564, + "num_tokens": 618292682.0, + "step": 16206 + }, + { + "epoch": 2.0616969851163973, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0285708904266357, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8514763116836548, + "num_tokens": 618331375.0, + "step": 16207 + }, + { + "epoch": 2.061824195394988, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.91128671169281, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8704020977020264, + "num_tokens": 618367452.0, + "step": 16208 + }, + { + "epoch": 2.0619514056735784, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.2559995651245117, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8642240762710571, + "num_tokens": 618410470.0, + "step": 16209 + }, + { + "epoch": 2.062078615952169, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8476864099502563, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8589479327201843, + "num_tokens": 618447524.0, + "step": 16210 + }, + { + "epoch": 2.0622058262307594, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.008553981781006, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8478842377662659, + "num_tokens": 618481793.0, + "step": 16211 + }, + { + "epoch": 2.06233303650935, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8362151384353638, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8713912963867188, + "num_tokens": 618517097.0, + "step": 16212 + }, + { + "epoch": 2.0624602467879405, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7523655891418457, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8634045720100403, + "num_tokens": 618554221.0, + "step": 16213 + }, + { + "epoch": 2.062587457066531, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8138521909713745, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8770094513893127, + "num_tokens": 618591279.0, + "step": 16214 + }, + { + "epoch": 2.0627146673451215, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0711517333984375, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8541473150253296, + "num_tokens": 618628168.0, + "step": 16215 + }, + { + "epoch": 2.062841877623712, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8173515796661377, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8721544146537781, + "num_tokens": 618664580.0, + "step": 16216 + }, + { + "epoch": 2.0629690879023026, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9804190397262573, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8553719520568848, + "num_tokens": 618699007.0, + "step": 16217 + }, + { + "epoch": 2.063096298180893, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8321940898895264, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8676825761795044, + "num_tokens": 618736630.0, + "step": 16218 + }, + { + "epoch": 2.0632235084594837, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.761563777923584, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8746600151062012, + "num_tokens": 618777868.0, + "step": 16219 + }, + { + "epoch": 2.063350718738074, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7501304149627686, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8699947595596313, + "num_tokens": 618819134.0, + "step": 16220 + }, + { + "epoch": 2.0634779290166647, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.685100793838501, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8651535511016846, + "num_tokens": 618862231.0, + "step": 16221 + }, + { + "epoch": 2.0636051392952552, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.7182810306549072, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.859872579574585, + "num_tokens": 618906816.0, + "step": 16222 + }, + { + "epoch": 2.0637323495738458, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.860883116722107, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8663630485534668, + "num_tokens": 618943239.0, + "step": 16223 + }, + { + "epoch": 2.0638595598524363, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8195925951004028, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.879681408405304, + "num_tokens": 618978641.0, + "step": 16224 + }, + { + "epoch": 2.0639867701310264, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.739756464958191, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8679080009460449, + "num_tokens": 619020522.0, + "step": 16225 + }, + { + "epoch": 2.064113980409617, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.7662248611450195, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.877948522567749, + "num_tokens": 619060769.0, + "step": 16226 + }, + { + "epoch": 2.0642411906882074, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8053780794143677, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.877225399017334, + "num_tokens": 619098711.0, + "step": 16227 + }, + { + "epoch": 2.064368400966798, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.1050448417663574, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8629645109176636, + "num_tokens": 619133673.0, + "step": 16228 + }, + { + "epoch": 2.0644956112453885, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8526827096939087, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8617420792579651, + "num_tokens": 619174554.0, + "step": 16229 + }, + { + "epoch": 2.064622821523979, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.05633807182312, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8569064140319824, + "num_tokens": 619211095.0, + "step": 16230 + }, + { + "epoch": 2.0647500318025696, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9181572198867798, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8769580125808716, + "num_tokens": 619246346.0, + "step": 16231 + }, + { + "epoch": 2.06487724208116, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9605634212493896, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8653350472450256, + "num_tokens": 619280629.0, + "step": 16232 + }, + { + "epoch": 2.0650044523597506, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8481884002685547, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8466312885284424, + "num_tokens": 619320928.0, + "step": 16233 + }, + { + "epoch": 2.065131662638341, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.747267723083496, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8683443665504456, + "num_tokens": 619369747.0, + "step": 16234 + }, + { + "epoch": 2.0652588729169317, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8707636594772339, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8697713613510132, + "num_tokens": 619408694.0, + "step": 16235 + }, + { + "epoch": 2.065386083195522, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7346587181091309, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8741700649261475, + "num_tokens": 619446858.0, + "step": 16236 + }, + { + "epoch": 2.0655132934741127, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0702357292175293, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8657419085502625, + "num_tokens": 619482281.0, + "step": 16237 + }, + { + "epoch": 2.0656405037527032, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.809217929840088, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8760796785354614, + "num_tokens": 619520309.0, + "step": 16238 + }, + { + "epoch": 2.065767714031294, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8859361410140991, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8699116110801697, + "num_tokens": 619554662.0, + "step": 16239 + }, + { + "epoch": 2.0658949243098843, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8670216798782349, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8620235323905945, + "num_tokens": 619599632.0, + "step": 16240 + }, + { + "epoch": 2.066022134588475, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8153997659683228, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8804019093513489, + "num_tokens": 619639414.0, + "step": 16241 + }, + { + "epoch": 2.0661493448670654, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.931888461112976, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8692039847373962, + "num_tokens": 619675878.0, + "step": 16242 + }, + { + "epoch": 2.066276555145656, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9734786748886108, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8694533705711365, + "num_tokens": 619709486.0, + "step": 16243 + }, + { + "epoch": 2.0664037654242464, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.878116250038147, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8670684099197388, + "num_tokens": 619753576.0, + "step": 16244 + }, + { + "epoch": 2.066530975702837, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0023367404937744, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8666467666625977, + "num_tokens": 619792520.0, + "step": 16245 + }, + { + "epoch": 2.0666581859814275, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.098721981048584, + "learning_rate": 1e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8457355499267578, + "num_tokens": 619827871.0, + "step": 16246 + }, + { + "epoch": 2.066785396260018, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8297911882400513, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8623366355895996, + "num_tokens": 619864527.0, + "step": 16247 + }, + { + "epoch": 2.0669126065386085, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8184947967529297, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8675037026405334, + "num_tokens": 619904934.0, + "step": 16248 + }, + { + "epoch": 2.0670398168171986, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8173394203186035, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.874862551689148, + "num_tokens": 619945472.0, + "step": 16249 + }, + { + "epoch": 2.067167027095789, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9265046119689941, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8793734312057495, + "num_tokens": 619981738.0, + "step": 16250 + }, + { + "epoch": 2.0672942373743797, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8810858726501465, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8679317235946655, + "num_tokens": 620024593.0, + "step": 16251 + }, + { + "epoch": 2.06742144765297, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8786414861679077, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8727973699569702, + "num_tokens": 620064059.0, + "step": 16252 + }, + { + "epoch": 2.0675486579315607, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8230533599853516, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8659679889678955, + "num_tokens": 620100028.0, + "step": 16253 + }, + { + "epoch": 2.0676758682101513, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8435612916946411, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.879231333732605, + "num_tokens": 620135181.0, + "step": 16254 + }, + { + "epoch": 2.067803078488742, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0313973426818848, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8668422102928162, + "num_tokens": 620174887.0, + "step": 16255 + }, + { + "epoch": 2.0679302887673323, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8605303764343262, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.867828905582428, + "num_tokens": 620212924.0, + "step": 16256 + }, + { + "epoch": 2.068057499045923, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9632868766784668, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8569278120994568, + "num_tokens": 620251088.0, + "step": 16257 + }, + { + "epoch": 2.0681847093245134, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1023852825164795, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8660861849784851, + "num_tokens": 620287888.0, + "step": 16258 + }, + { + "epoch": 2.068311919603104, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9531230926513672, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8661375045776367, + "num_tokens": 620324937.0, + "step": 16259 + }, + { + "epoch": 2.0684391298816944, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9576441049575806, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8679519295692444, + "num_tokens": 620363390.0, + "step": 16260 + }, + { + "epoch": 2.068566340160285, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9245401620864868, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8737007975578308, + "num_tokens": 620403707.0, + "step": 16261 + }, + { + "epoch": 2.0686935504388755, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9963651895523071, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8720857501029968, + "num_tokens": 620445994.0, + "step": 16262 + }, + { + "epoch": 2.068820760717466, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8563776016235352, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8751635551452637, + "num_tokens": 620480883.0, + "step": 16263 + }, + { + "epoch": 2.0689479709960565, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.89279043674469, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8792620301246643, + "num_tokens": 620518935.0, + "step": 16264 + }, + { + "epoch": 2.069075181274647, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0511622428894043, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8608639240264893, + "num_tokens": 620551985.0, + "step": 16265 + }, + { + "epoch": 2.0692023915532376, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.016550064086914, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8622770309448242, + "num_tokens": 620592809.0, + "step": 16266 + }, + { + "epoch": 2.069329601831828, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1385910511016846, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8515057563781738, + "num_tokens": 620627951.0, + "step": 16267 + }, + { + "epoch": 2.0694568121104187, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.986883521080017, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8712064027786255, + "num_tokens": 620660892.0, + "step": 16268 + }, + { + "epoch": 2.069584022389009, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.044130563735962, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8422993421554565, + "num_tokens": 620698165.0, + "step": 16269 + }, + { + "epoch": 2.0697112326675997, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0068633556365967, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8574844002723694, + "num_tokens": 620734758.0, + "step": 16270 + }, + { + "epoch": 2.0698384429461902, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3637399673461914, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8470291495323181, + "num_tokens": 620770224.0, + "step": 16271 + }, + { + "epoch": 2.0699656532247808, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8847607374191284, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8749728202819824, + "num_tokens": 620805724.0, + "step": 16272 + }, + { + "epoch": 2.070092863503371, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8991403579711914, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8548346757888794, + "num_tokens": 620847622.0, + "step": 16273 + }, + { + "epoch": 2.0702200737819614, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.056601047515869, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8644888401031494, + "num_tokens": 620879042.0, + "step": 16274 + }, + { + "epoch": 2.070347284060552, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.765688180923462, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8633029460906982, + "num_tokens": 620922685.0, + "step": 16275 + }, + { + "epoch": 2.0704744943391424, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8764164447784424, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.860442042350769, + "num_tokens": 620954854.0, + "step": 16276 + }, + { + "epoch": 2.070601704617733, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.887160062789917, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8625136017799377, + "num_tokens": 620994581.0, + "step": 16277 + }, + { + "epoch": 2.0707289148963235, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3764255046844482, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8662256598472595, + "num_tokens": 621030052.0, + "step": 16278 + }, + { + "epoch": 2.070856125174914, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7247992753982544, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8753628730773926, + "num_tokens": 621072474.0, + "step": 16279 + }, + { + "epoch": 2.0709833354535045, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0283005237579346, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8720322251319885, + "num_tokens": 621104365.0, + "step": 16280 + }, + { + "epoch": 2.071110545732095, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9324615001678467, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8720147013664246, + "num_tokens": 621140497.0, + "step": 16281 + }, + { + "epoch": 2.0712377560106856, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8478960990905762, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8695914149284363, + "num_tokens": 621177431.0, + "step": 16282 + }, + { + "epoch": 2.071364966289276, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8797858953475952, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8710018396377563, + "num_tokens": 621212874.0, + "step": 16283 + }, + { + "epoch": 2.0714921765678667, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.501883029937744, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8720390796661377, + "num_tokens": 621249927.0, + "step": 16284 + }, + { + "epoch": 2.071619386846457, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1383602619171143, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.843752384185791, + "num_tokens": 621280794.0, + "step": 16285 + }, + { + "epoch": 2.0717465971250477, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9922573566436768, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8504348993301392, + "num_tokens": 621321167.0, + "step": 16286 + }, + { + "epoch": 2.0718738074036382, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0979130268096924, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8779730200767517, + "num_tokens": 621357962.0, + "step": 16287 + }, + { + "epoch": 2.0720010176822288, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7437776327133179, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8844946026802063, + "num_tokens": 621398933.0, + "step": 16288 + }, + { + "epoch": 2.0721282279608193, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.6919304132461548, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8859636783599854, + "num_tokens": 621442334.0, + "step": 16289 + }, + { + "epoch": 2.07225543823941, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7714192867279053, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.870225191116333, + "num_tokens": 621478706.0, + "step": 16290 + }, + { + "epoch": 2.0723826485180004, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0099194049835205, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8740589022636414, + "num_tokens": 621513911.0, + "step": 16291 + }, + { + "epoch": 2.072509858796591, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9177206754684448, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8623085021972656, + "num_tokens": 621554080.0, + "step": 16292 + }, + { + "epoch": 2.0726370690751814, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.847585678100586, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8695250153541565, + "num_tokens": 621592059.0, + "step": 16293 + }, + { + "epoch": 2.072764279353772, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8418967723846436, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8800273537635803, + "num_tokens": 621629626.0, + "step": 16294 + }, + { + "epoch": 2.0728914896323625, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8547261953353882, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8723949193954468, + "num_tokens": 621671341.0, + "step": 16295 + }, + { + "epoch": 2.073018699910953, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8269225358963013, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8822348713874817, + "num_tokens": 621708756.0, + "step": 16296 + }, + { + "epoch": 2.0731459101895435, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8979915380477905, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8722864389419556, + "num_tokens": 621748009.0, + "step": 16297 + }, + { + "epoch": 2.0732731204681336, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9357678890228271, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8755854964256287, + "num_tokens": 621788695.0, + "step": 16298 + }, + { + "epoch": 2.073400330746724, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.90091073513031, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8649091720581055, + "num_tokens": 621828099.0, + "step": 16299 + }, + { + "epoch": 2.0735275410253147, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9800711870193481, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8519251346588135, + "num_tokens": 621867033.0, + "step": 16300 + }, + { + "epoch": 2.073654751303905, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.230803966522217, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8574507236480713, + "num_tokens": 621907435.0, + "step": 16301 + }, + { + "epoch": 2.0737819615824957, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8380142450332642, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8569331169128418, + "num_tokens": 621948995.0, + "step": 16302 + }, + { + "epoch": 2.0739091718610863, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8185676336288452, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8786791563034058, + "num_tokens": 621991109.0, + "step": 16303 + }, + { + "epoch": 2.074036382139677, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9160689115524292, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8721961975097656, + "num_tokens": 622026585.0, + "step": 16304 + }, + { + "epoch": 2.0741635924182673, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.999029278755188, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8768764734268188, + "num_tokens": 622065208.0, + "step": 16305 + }, + { + "epoch": 2.074290802696858, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9556196928024292, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.856121301651001, + "num_tokens": 622104838.0, + "step": 16306 + }, + { + "epoch": 2.0744180129754484, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.222806215286255, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8594491481781006, + "num_tokens": 622136879.0, + "step": 16307 + }, + { + "epoch": 2.074545223254039, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0296924114227295, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8792678117752075, + "num_tokens": 622175938.0, + "step": 16308 + }, + { + "epoch": 2.0746724335326294, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9223765134811401, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8752623796463013, + "num_tokens": 622216663.0, + "step": 16309 + }, + { + "epoch": 2.07479964381122, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0088601112365723, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8757678270339966, + "num_tokens": 622250450.0, + "step": 16310 + }, + { + "epoch": 2.0749268540898105, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8597005605697632, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8602063655853271, + "num_tokens": 622291919.0, + "step": 16311 + }, + { + "epoch": 2.075054064368401, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3401572704315186, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8681772947311401, + "num_tokens": 622325892.0, + "step": 16312 + }, + { + "epoch": 2.0751812746469915, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1444144248962402, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8671126365661621, + "num_tokens": 622356966.0, + "step": 16313 + }, + { + "epoch": 2.075308484925582, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.989621639251709, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8703141212463379, + "num_tokens": 622389931.0, + "step": 16314 + }, + { + "epoch": 2.0754356952041726, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8798086643218994, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8647550344467163, + "num_tokens": 622427817.0, + "step": 16315 + }, + { + "epoch": 2.075562905482763, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8899438381195068, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8576218485832214, + "num_tokens": 622470353.0, + "step": 16316 + }, + { + "epoch": 2.0756901157613536, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7552568912506104, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8842003345489502, + "num_tokens": 622507577.0, + "step": 16317 + }, + { + "epoch": 2.075817326039944, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9675860404968262, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8614177703857422, + "num_tokens": 622544746.0, + "step": 16318 + }, + { + "epoch": 2.0759445363185347, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9849164485931396, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8609384298324585, + "num_tokens": 622583688.0, + "step": 16319 + }, + { + "epoch": 2.0760717465971252, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.8924235105514526, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8677464127540588, + "num_tokens": 622619911.0, + "step": 16320 + }, + { + "epoch": 2.0761989568757158, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9123587608337402, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8760027289390564, + "num_tokens": 622665732.0, + "step": 16321 + }, + { + "epoch": 2.0763261671543063, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.881732702255249, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8750737905502319, + "num_tokens": 622705746.0, + "step": 16322 + }, + { + "epoch": 2.0764533774328964, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.926703929901123, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8677836656570435, + "num_tokens": 622745481.0, + "step": 16323 + }, + { + "epoch": 2.076580587711487, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.7968813180923462, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8702948093414307, + "num_tokens": 622786910.0, + "step": 16324 + }, + { + "epoch": 2.0767077979900774, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.940975308418274, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8626712560653687, + "num_tokens": 622826976.0, + "step": 16325 + }, + { + "epoch": 2.076835008268668, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.0322608947753906, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8679901361465454, + "num_tokens": 622858945.0, + "step": 16326 + }, + { + "epoch": 2.0769622185472585, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3709909915924072, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.879513680934906, + "num_tokens": 622893229.0, + "step": 16327 + }, + { + "epoch": 2.077089428825849, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.7400717735290527, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8772340416908264, + "num_tokens": 622933701.0, + "step": 16328 + }, + { + "epoch": 2.0772166391044395, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9924285411834717, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.854649007320404, + "num_tokens": 622971226.0, + "step": 16329 + }, + { + "epoch": 2.07734384938303, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9121227264404297, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8636225461959839, + "num_tokens": 623009232.0, + "step": 16330 + }, + { + "epoch": 2.0774710596616206, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.044100522994995, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8606555461883545, + "num_tokens": 623048005.0, + "step": 16331 + }, + { + "epoch": 2.077598269940211, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8359029293060303, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8660092353820801, + "num_tokens": 623088474.0, + "step": 16332 + }, + { + "epoch": 2.0777254802188017, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.88642418384552, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8687381744384766, + "num_tokens": 623123700.0, + "step": 16333 + }, + { + "epoch": 2.077852690497392, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 6.406893253326416, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8713583946228027, + "num_tokens": 623161406.0, + "step": 16334 + }, + { + "epoch": 2.0779799007759827, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.030815839767456, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8581835627555847, + "num_tokens": 623199770.0, + "step": 16335 + }, + { + "epoch": 2.0781071110545732, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8978122472763062, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8708423972129822, + "num_tokens": 623234635.0, + "step": 16336 + }, + { + "epoch": 2.0782343213331638, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.867563009262085, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8690586090087891, + "num_tokens": 623273695.0, + "step": 16337 + }, + { + "epoch": 2.0783615316117543, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8139433860778809, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8517428636550903, + "num_tokens": 623315278.0, + "step": 16338 + }, + { + "epoch": 2.078488741890345, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.911324143409729, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8756850957870483, + "num_tokens": 623351753.0, + "step": 16339 + }, + { + "epoch": 2.0786159521689354, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0117204189300537, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.850005030632019, + "num_tokens": 623387812.0, + "step": 16340 + }, + { + "epoch": 2.078743162447526, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9255999326705933, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8700684309005737, + "num_tokens": 623426784.0, + "step": 16341 + }, + { + "epoch": 2.0788703727261164, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0212721824645996, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.844566822052002, + "num_tokens": 623463362.0, + "step": 16342 + }, + { + "epoch": 2.078997583004707, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1536641120910645, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8599387407302856, + "num_tokens": 623500108.0, + "step": 16343 + }, + { + "epoch": 2.0791247932832975, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.019636869430542, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8637836575508118, + "num_tokens": 623537159.0, + "step": 16344 + }, + { + "epoch": 2.079252003561888, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.417081594467163, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8858367800712585, + "num_tokens": 623576256.0, + "step": 16345 + }, + { + "epoch": 2.0793792138404785, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8956032991409302, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8627451062202454, + "num_tokens": 623612454.0, + "step": 16346 + }, + { + "epoch": 2.0795064241190686, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7311960458755493, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8807286620140076, + "num_tokens": 623654392.0, + "step": 16347 + }, + { + "epoch": 2.079633634397659, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8260294198989868, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8493074178695679, + "num_tokens": 623697350.0, + "step": 16348 + }, + { + "epoch": 2.0797608446762497, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 16.61006736755371, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8704283833503723, + "num_tokens": 623736272.0, + "step": 16349 + }, + { + "epoch": 2.07988805495484, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0668625831604004, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8583246469497681, + "num_tokens": 623781319.0, + "step": 16350 + }, + { + "epoch": 2.0800152652334307, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8282767534255981, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8670268058776855, + "num_tokens": 623821944.0, + "step": 16351 + }, + { + "epoch": 2.0801424755120212, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0108449459075928, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8625180125236511, + "num_tokens": 623858399.0, + "step": 16352 + }, + { + "epoch": 2.0802696857906118, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9307132959365845, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8685610294342041, + "num_tokens": 623897839.0, + "step": 16353 + }, + { + "epoch": 2.0803968960692023, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9390113353729248, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8508500456809998, + "num_tokens": 623933127.0, + "step": 16354 + }, + { + "epoch": 2.080524106347793, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.4006645679473877, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.865763783454895, + "num_tokens": 623972140.0, + "step": 16355 + }, + { + "epoch": 2.0806513166263834, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.000488758087158, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8569061756134033, + "num_tokens": 624008875.0, + "step": 16356 + }, + { + "epoch": 2.080778526904974, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 3.2589683532714844, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8647931814193726, + "num_tokens": 624045162.0, + "step": 16357 + }, + { + "epoch": 2.0809057371835644, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8939749002456665, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.87787926197052, + "num_tokens": 624085275.0, + "step": 16358 + }, + { + "epoch": 2.081032947462155, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.015073776245117, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8644721508026123, + "num_tokens": 624127479.0, + "step": 16359 + }, + { + "epoch": 2.0811601577407455, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8072224855422974, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8708741664886475, + "num_tokens": 624165231.0, + "step": 16360 + }, + { + "epoch": 2.081287368019336, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8401638269424438, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8579187393188477, + "num_tokens": 624205109.0, + "step": 16361 + }, + { + "epoch": 2.0814145782979265, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.913196325302124, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8620697259902954, + "num_tokens": 624241849.0, + "step": 16362 + }, + { + "epoch": 2.081541788576517, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.0428824424743652, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8682949542999268, + "num_tokens": 624274174.0, + "step": 16363 + }, + { + "epoch": 2.0816689988551076, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9767342805862427, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8648641109466553, + "num_tokens": 624311526.0, + "step": 16364 + }, + { + "epoch": 2.081796209133698, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.9872210025787354, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8529070615768433, + "num_tokens": 624346657.0, + "step": 16365 + }, + { + "epoch": 2.0819234194122886, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.7700999975204468, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8511884212493896, + "num_tokens": 624386563.0, + "step": 16366 + }, + { + "epoch": 2.082050629690879, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7464499473571777, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8761261701583862, + "num_tokens": 624427299.0, + "step": 16367 + }, + { + "epoch": 2.0821778399694697, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.044921875, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8495262265205383, + "num_tokens": 624461253.0, + "step": 16368 + }, + { + "epoch": 2.0823050502480602, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.498396158218384, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8756333589553833, + "num_tokens": 624498699.0, + "step": 16369 + }, + { + "epoch": 2.0824322605266508, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0354061126708984, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8641353845596313, + "num_tokens": 624534655.0, + "step": 16370 + }, + { + "epoch": 2.082559470805241, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.15560245513916, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8710256814956665, + "num_tokens": 624575342.0, + "step": 16371 + }, + { + "epoch": 2.0826866810838314, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1315810680389404, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8710336089134216, + "num_tokens": 624608493.0, + "step": 16372 + }, + { + "epoch": 2.082813891362422, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7780083417892456, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.847869873046875, + "num_tokens": 624655777.0, + "step": 16373 + }, + { + "epoch": 2.0829411016410124, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.6898818016052246, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8764228820800781, + "num_tokens": 624699349.0, + "step": 16374 + }, + { + "epoch": 2.083068311919603, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0255720615386963, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8738501071929932, + "num_tokens": 624730146.0, + "step": 16375 + }, + { + "epoch": 2.0831955221981935, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8326594829559326, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8578512668609619, + "num_tokens": 624773186.0, + "step": 16376 + }, + { + "epoch": 2.083322732476784, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9303154945373535, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8647753596305847, + "num_tokens": 624810256.0, + "step": 16377 + }, + { + "epoch": 2.0834499427553745, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0217151641845703, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8562699556350708, + "num_tokens": 624848166.0, + "step": 16378 + }, + { + "epoch": 2.083577153033965, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.650740146636963, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8650367856025696, + "num_tokens": 624887601.0, + "step": 16379 + }, + { + "epoch": 2.0837043633125556, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9012001752853394, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8636285662651062, + "num_tokens": 624921160.0, + "step": 16380 + }, + { + "epoch": 2.083831573591146, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9399453401565552, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8584204912185669, + "num_tokens": 624959053.0, + "step": 16381 + }, + { + "epoch": 2.0839587838697367, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8625648021697998, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.880100667476654, + "num_tokens": 624995272.0, + "step": 16382 + }, + { + "epoch": 2.084085994148327, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8050174713134766, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8823485374450684, + "num_tokens": 625033388.0, + "step": 16383 + }, + { + "epoch": 2.0842132044269177, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0794200897216797, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8555203676223755, + "num_tokens": 625071103.0, + "step": 16384 + }, + { + "epoch": 2.0843404147055082, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.743047833442688, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8788065314292908, + "num_tokens": 625109562.0, + "step": 16385 + }, + { + "epoch": 2.0844676249840988, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9456067085266113, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8718806505203247, + "num_tokens": 625147438.0, + "step": 16386 + }, + { + "epoch": 2.0845948352626893, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 80.52003479003906, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8753466606140137, + "num_tokens": 625184293.0, + "step": 16387 + }, + { + "epoch": 2.08472204554128, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2007029056549072, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8682283163070679, + "num_tokens": 625218201.0, + "step": 16388 + }, + { + "epoch": 2.0848492558198704, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1125996112823486, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8651586771011353, + "num_tokens": 625252072.0, + "step": 16389 + }, + { + "epoch": 2.084976466098461, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1331827640533447, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8794783353805542, + "num_tokens": 625286990.0, + "step": 16390 + }, + { + "epoch": 2.0851036763770514, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8323838710784912, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8771376013755798, + "num_tokens": 625326579.0, + "step": 16391 + }, + { + "epoch": 2.085230886655642, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8420549631118774, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8714389204978943, + "num_tokens": 625367317.0, + "step": 16392 + }, + { + "epoch": 2.0853580969342325, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9718809127807617, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.860352635383606, + "num_tokens": 625401047.0, + "step": 16393 + }, + { + "epoch": 2.085485307212823, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1368660926818848, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.851645827293396, + "num_tokens": 625441878.0, + "step": 16394 + }, + { + "epoch": 2.0856125174914135, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9907280206680298, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8640872240066528, + "num_tokens": 625477007.0, + "step": 16395 + }, + { + "epoch": 2.0857397277700036, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.953231692314148, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8780475854873657, + "num_tokens": 625519908.0, + "step": 16396 + }, + { + "epoch": 2.085866938048594, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.111210584640503, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8692835569381714, + "num_tokens": 625554946.0, + "step": 16397 + }, + { + "epoch": 2.0859941483271847, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9408683776855469, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8759709596633911, + "num_tokens": 625592121.0, + "step": 16398 + }, + { + "epoch": 2.086121358605775, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8378379344940186, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.864760160446167, + "num_tokens": 625630596.0, + "step": 16399 + }, + { + "epoch": 2.0862485688843657, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.966722846031189, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.849924623966217, + "num_tokens": 625669535.0, + "step": 16400 + }, + { + "epoch": 2.0863757791629562, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8844414949417114, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8649609088897705, + "num_tokens": 625710072.0, + "step": 16401 + }, + { + "epoch": 2.0865029894415468, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8805253505706787, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8738831877708435, + "num_tokens": 625746328.0, + "step": 16402 + }, + { + "epoch": 2.0866301997201373, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8258949518203735, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8665740489959717, + "num_tokens": 625787108.0, + "step": 16403 + }, + { + "epoch": 2.086757409998728, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7732301950454712, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8705806732177734, + "num_tokens": 625827887.0, + "step": 16404 + }, + { + "epoch": 2.0868846202773184, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8484693765640259, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8667082786560059, + "num_tokens": 625863475.0, + "step": 16405 + }, + { + "epoch": 2.087011830555909, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.779496192932129, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8817828893661499, + "num_tokens": 625902369.0, + "step": 16406 + }, + { + "epoch": 2.0871390408344994, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.888994812965393, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8403745293617249, + "num_tokens": 625941054.0, + "step": 16407 + }, + { + "epoch": 2.08726625111309, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0172736644744873, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8489146828651428, + "num_tokens": 625981113.0, + "step": 16408 + }, + { + "epoch": 2.0873934613916805, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9618903398513794, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8617203831672668, + "num_tokens": 626014758.0, + "step": 16409 + }, + { + "epoch": 2.087520671670271, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.013732671737671, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8513779044151306, + "num_tokens": 626054442.0, + "step": 16410 + }, + { + "epoch": 2.0876478819488615, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.040724515914917, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8477727174758911, + "num_tokens": 626093993.0, + "step": 16411 + }, + { + "epoch": 2.087775092227452, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0410513877868652, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8485486507415771, + "num_tokens": 626131242.0, + "step": 16412 + }, + { + "epoch": 2.0879023025060426, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2350869178771973, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.851961612701416, + "num_tokens": 626162137.0, + "step": 16413 + }, + { + "epoch": 2.088029512784633, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8241389989852905, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8668702840805054, + "num_tokens": 626201031.0, + "step": 16414 + }, + { + "epoch": 2.0881567230632236, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8366509675979614, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8754249811172485, + "num_tokens": 626243647.0, + "step": 16415 + }, + { + "epoch": 2.088283933341814, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.090346574783325, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8715872764587402, + "num_tokens": 626281346.0, + "step": 16416 + }, + { + "epoch": 2.0884111436204047, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.030021905899048, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8759296536445618, + "num_tokens": 626315964.0, + "step": 16417 + }, + { + "epoch": 2.0885383538989952, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8144294023513794, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8591605424880981, + "num_tokens": 626367920.0, + "step": 16418 + }, + { + "epoch": 2.0886655641775858, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.945560097694397, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8629878163337708, + "num_tokens": 626412123.0, + "step": 16419 + }, + { + "epoch": 2.0887927744561763, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9658491611480713, + "learning_rate": 1e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8376291990280151, + "num_tokens": 626451560.0, + "step": 16420 + }, + { + "epoch": 2.0889199847347664, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9333083629608154, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8600183725357056, + "num_tokens": 626489361.0, + "step": 16421 + }, + { + "epoch": 2.089047195013357, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8585026264190674, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8808479309082031, + "num_tokens": 626527150.0, + "step": 16422 + }, + { + "epoch": 2.0891744052919474, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8106566667556763, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8594838380813599, + "num_tokens": 626568110.0, + "step": 16423 + }, + { + "epoch": 2.089301615570538, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.139768362045288, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8663144111633301, + "num_tokens": 626607215.0, + "step": 16424 + }, + { + "epoch": 2.0894288258491285, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9968807697296143, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8709653615951538, + "num_tokens": 626644424.0, + "step": 16425 + }, + { + "epoch": 2.089556036127719, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.888540506362915, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8444615602493286, + "num_tokens": 626689509.0, + "step": 16426 + }, + { + "epoch": 2.0896832464063095, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1166837215423584, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8545909523963928, + "num_tokens": 626733866.0, + "step": 16427 + }, + { + "epoch": 2.0898104566849, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8921163082122803, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8728182911872864, + "num_tokens": 626767465.0, + "step": 16428 + }, + { + "epoch": 2.0899376669634906, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.830991268157959, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8525285720825195, + "num_tokens": 626806049.0, + "step": 16429 + }, + { + "epoch": 2.090064877242081, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.351917266845703, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8517100811004639, + "num_tokens": 626848917.0, + "step": 16430 + }, + { + "epoch": 2.0901920875206716, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.978903889656067, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8520616888999939, + "num_tokens": 626884442.0, + "step": 16431 + }, + { + "epoch": 2.090319297799262, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9563982486724854, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8626587390899658, + "num_tokens": 626922517.0, + "step": 16432 + }, + { + "epoch": 2.0904465080778527, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7658780813217163, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.862622082233429, + "num_tokens": 626962880.0, + "step": 16433 + }, + { + "epoch": 2.0905737183564432, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7607496976852417, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8727725744247437, + "num_tokens": 627008115.0, + "step": 16434 + }, + { + "epoch": 2.0907009286350338, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8698360919952393, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.876507043838501, + "num_tokens": 627043935.0, + "step": 16435 + }, + { + "epoch": 2.0908281389136243, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.036144971847534, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8657035827636719, + "num_tokens": 627078495.0, + "step": 16436 + }, + { + "epoch": 2.090955349192215, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.529122829437256, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8687881231307983, + "num_tokens": 627116652.0, + "step": 16437 + }, + { + "epoch": 2.0910825594708053, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9372880458831787, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8731991052627563, + "num_tokens": 627152326.0, + "step": 16438 + }, + { + "epoch": 2.091209769749396, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0518860816955566, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8753838539123535, + "num_tokens": 627189502.0, + "step": 16439 + }, + { + "epoch": 2.0913369800279864, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.094295024871826, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8788689374923706, + "num_tokens": 627225438.0, + "step": 16440 + }, + { + "epoch": 2.091464190306577, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0095207691192627, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8621311783790588, + "num_tokens": 627258154.0, + "step": 16441 + }, + { + "epoch": 2.0915914005851675, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8478835821151733, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8583029508590698, + "num_tokens": 627301804.0, + "step": 16442 + }, + { + "epoch": 2.091718610863758, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8963282108306885, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.870198667049408, + "num_tokens": 627344301.0, + "step": 16443 + }, + { + "epoch": 2.0918458211423485, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8979074954986572, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8737246990203857, + "num_tokens": 627381068.0, + "step": 16444 + }, + { + "epoch": 2.0919730314209386, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8399131298065186, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8577347993850708, + "num_tokens": 627423410.0, + "step": 16445 + }, + { + "epoch": 2.092100241699529, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9875699281692505, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8760818243026733, + "num_tokens": 627456593.0, + "step": 16446 + }, + { + "epoch": 2.0922274519781197, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.899082899093628, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8675205111503601, + "num_tokens": 627493742.0, + "step": 16447 + }, + { + "epoch": 2.09235466225671, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9452316761016846, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.869995653629303, + "num_tokens": 627529809.0, + "step": 16448 + }, + { + "epoch": 2.0924818725353007, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.79262113571167, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.868975818157196, + "num_tokens": 627570693.0, + "step": 16449 + }, + { + "epoch": 2.0926090828138912, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9602646827697754, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8743488788604736, + "num_tokens": 627607608.0, + "step": 16450 + }, + { + "epoch": 2.0927362930924818, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.227153778076172, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8636480569839478, + "num_tokens": 627642102.0, + "step": 16451 + }, + { + "epoch": 2.0928635033710723, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.4522619247436523, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8591262102127075, + "num_tokens": 627678838.0, + "step": 16452 + }, + { + "epoch": 2.092990713649663, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8953438997268677, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8704909682273865, + "num_tokens": 627714952.0, + "step": 16453 + }, + { + "epoch": 2.0931179239282534, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3871326446533203, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8567357063293457, + "num_tokens": 627754710.0, + "step": 16454 + }, + { + "epoch": 2.093245134206844, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.017509698867798, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8704144358634949, + "num_tokens": 627791504.0, + "step": 16455 + }, + { + "epoch": 2.0933723444854344, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0188474655151367, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8788242340087891, + "num_tokens": 627829683.0, + "step": 16456 + }, + { + "epoch": 2.093499554764025, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.973095178604126, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8667160272598267, + "num_tokens": 627869826.0, + "step": 16457 + }, + { + "epoch": 2.0936267650426155, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9893759489059448, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8620520830154419, + "num_tokens": 627904165.0, + "step": 16458 + }, + { + "epoch": 2.093753975321206, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0959813594818115, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8708838224411011, + "num_tokens": 627936559.0, + "step": 16459 + }, + { + "epoch": 2.0938811855997965, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.958846092224121, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8716086745262146, + "num_tokens": 627971742.0, + "step": 16460 + }, + { + "epoch": 2.094008395878387, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9101210832595825, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8752410411834717, + "num_tokens": 628013104.0, + "step": 16461 + }, + { + "epoch": 2.0941356061569776, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.024289131164551, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8622097969055176, + "num_tokens": 628049378.0, + "step": 16462 + }, + { + "epoch": 2.094262816435568, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.806843876838684, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8842405080795288, + "num_tokens": 628087213.0, + "step": 16463 + }, + { + "epoch": 2.0943900267141586, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9768427610397339, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8702991008758545, + "num_tokens": 628122060.0, + "step": 16464 + }, + { + "epoch": 2.094517236992749, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.875022053718567, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8690972924232483, + "num_tokens": 628166554.0, + "step": 16465 + }, + { + "epoch": 2.0946444472713397, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3151347637176514, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8651266098022461, + "num_tokens": 628199953.0, + "step": 16466 + }, + { + "epoch": 2.09477165754993, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.908744215965271, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8684929609298706, + "num_tokens": 628237044.0, + "step": 16467 + }, + { + "epoch": 2.0948988678285207, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.02341628074646, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8697463870048523, + "num_tokens": 628275773.0, + "step": 16468 + }, + { + "epoch": 2.095026078107111, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.016591787338257, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8498786687850952, + "num_tokens": 628314549.0, + "step": 16469 + }, + { + "epoch": 2.0951532883857014, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1380929946899414, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8600391149520874, + "num_tokens": 628349417.0, + "step": 16470 + }, + { + "epoch": 2.095280498664292, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.059237241744995, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8711466193199158, + "num_tokens": 628384435.0, + "step": 16471 + }, + { + "epoch": 2.0954077089428824, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0907046794891357, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8532162308692932, + "num_tokens": 628424124.0, + "step": 16472 + }, + { + "epoch": 2.095534919221473, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.001056432723999, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8694829940795898, + "num_tokens": 628462412.0, + "step": 16473 + }, + { + "epoch": 2.0956621295000635, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.208657741546631, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8482951521873474, + "num_tokens": 628499066.0, + "step": 16474 + }, + { + "epoch": 2.095789339778654, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9670568704605103, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8808021545410156, + "num_tokens": 628532379.0, + "step": 16475 + }, + { + "epoch": 2.0959165500572445, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.851236343383789, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.874930739402771, + "num_tokens": 628570497.0, + "step": 16476 + }, + { + "epoch": 2.096043760335835, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8818607330322266, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8701192140579224, + "num_tokens": 628608298.0, + "step": 16477 + }, + { + "epoch": 2.0961709706144256, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9857414960861206, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8843735456466675, + "num_tokens": 628641217.0, + "step": 16478 + }, + { + "epoch": 2.096298180893016, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8428531885147095, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8673580884933472, + "num_tokens": 628682480.0, + "step": 16479 + }, + { + "epoch": 2.0964253911716066, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8330354690551758, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8633394241333008, + "num_tokens": 628718359.0, + "step": 16480 + }, + { + "epoch": 2.096552601450197, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.985018014907837, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8739023208618164, + "num_tokens": 628752214.0, + "step": 16481 + }, + { + "epoch": 2.0966798117287877, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.949253797531128, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8792763948440552, + "num_tokens": 628787882.0, + "step": 16482 + }, + { + "epoch": 2.0968070220073782, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9994025230407715, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8627715706825256, + "num_tokens": 628824910.0, + "step": 16483 + }, + { + "epoch": 2.0969342322859688, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.2477314472198486, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8792508244514465, + "num_tokens": 628860657.0, + "step": 16484 + }, + { + "epoch": 2.0970614425645593, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0432658195495605, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8496146202087402, + "num_tokens": 628900619.0, + "step": 16485 + }, + { + "epoch": 2.09718865284315, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7867693901062012, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8782819509506226, + "num_tokens": 628944304.0, + "step": 16486 + }, + { + "epoch": 2.0973158631217403, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.967673897743225, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8694819211959839, + "num_tokens": 628980490.0, + "step": 16487 + }, + { + "epoch": 2.097443073400331, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9573147296905518, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8713569045066833, + "num_tokens": 629011924.0, + "step": 16488 + }, + { + "epoch": 2.0975702836789214, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9133234024047852, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8424062132835388, + "num_tokens": 629049514.0, + "step": 16489 + }, + { + "epoch": 2.097697493957512, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.900647759437561, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8838076591491699, + "num_tokens": 629085374.0, + "step": 16490 + }, + { + "epoch": 2.0978247042361025, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8123780488967896, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8630231618881226, + "num_tokens": 629126307.0, + "step": 16491 + }, + { + "epoch": 2.097951914514693, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 4.022903919219971, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8672990202903748, + "num_tokens": 629159693.0, + "step": 16492 + }, + { + "epoch": 2.0980791247932835, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9897774457931519, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8722381591796875, + "num_tokens": 629198544.0, + "step": 16493 + }, + { + "epoch": 2.0982063350718736, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8690482378005981, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8593085408210754, + "num_tokens": 629240956.0, + "step": 16494 + }, + { + "epoch": 2.098333545350464, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9200330972671509, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8676982522010803, + "num_tokens": 629276196.0, + "step": 16495 + }, + { + "epoch": 2.0984607556290547, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1009624004364014, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8531476259231567, + "num_tokens": 629311398.0, + "step": 16496 + }, + { + "epoch": 2.098587965907645, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8479347229003906, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8608870506286621, + "num_tokens": 629349488.0, + "step": 16497 + }, + { + "epoch": 2.0987151761862357, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8319263458251953, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8806244730949402, + "num_tokens": 629388023.0, + "step": 16498 + }, + { + "epoch": 2.0988423864648262, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8676667213439941, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.867039680480957, + "num_tokens": 629432140.0, + "step": 16499 + }, + { + "epoch": 2.0989695967434168, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.6533782482147217, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8531787395477295, + "num_tokens": 629480690.0, + "step": 16500 + }, + { + "epoch": 2.0990968070220073, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8800960779190063, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8832762837409973, + "num_tokens": 629522500.0, + "step": 16501 + }, + { + "epoch": 2.099224017300598, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.193514108657837, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8684433698654175, + "num_tokens": 629563450.0, + "step": 16502 + }, + { + "epoch": 2.0993512275791884, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9041857719421387, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8763657212257385, + "num_tokens": 629603753.0, + "step": 16503 + }, + { + "epoch": 2.099478437857779, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.956852912902832, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8693802356719971, + "num_tokens": 629637828.0, + "step": 16504 + }, + { + "epoch": 2.0996056481363694, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.7695631980895996, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8735130429267883, + "num_tokens": 629674636.0, + "step": 16505 + }, + { + "epoch": 2.09973285841496, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8998346328735352, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8570911884307861, + "num_tokens": 629717046.0, + "step": 16506 + }, + { + "epoch": 2.0998600686935505, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.2380430698394775, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8766486644744873, + "num_tokens": 629750331.0, + "step": 16507 + }, + { + "epoch": 2.099987278972141, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8975800275802612, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.848097026348114, + "num_tokens": 629790095.0, + "step": 16508 + }, + { + "epoch": 2.1001144892507315, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9762969017028809, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8813329935073853, + "num_tokens": 629823035.0, + "step": 16509 + }, + { + "epoch": 2.100241699529322, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3482255935668945, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8630549311637878, + "num_tokens": 629857096.0, + "step": 16510 + }, + { + "epoch": 2.1003689098079126, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.2261295318603516, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8642885684967041, + "num_tokens": 629893448.0, + "step": 16511 + }, + { + "epoch": 2.100496120086503, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.059971570968628, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8646360039710999, + "num_tokens": 629931301.0, + "step": 16512 + }, + { + "epoch": 2.1006233303650936, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.8385868072509766, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8738797903060913, + "num_tokens": 629963481.0, + "step": 16513 + }, + { + "epoch": 2.100750540643684, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9951249361038208, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8624561429023743, + "num_tokens": 630002739.0, + "step": 16514 + }, + { + "epoch": 2.1008777509222747, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9215891361236572, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8597999811172485, + "num_tokens": 630041206.0, + "step": 16515 + }, + { + "epoch": 2.101004961200865, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.909252643585205, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8527840375900269, + "num_tokens": 630083811.0, + "step": 16516 + }, + { + "epoch": 2.1011321714794557, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9588960409164429, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8629934191703796, + "num_tokens": 630119823.0, + "step": 16517 + }, + { + "epoch": 2.1012593817580463, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9003028869628906, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8669911026954651, + "num_tokens": 630163807.0, + "step": 16518 + }, + { + "epoch": 2.1013865920366364, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9736956357955933, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8617343306541443, + "num_tokens": 630203038.0, + "step": 16519 + }, + { + "epoch": 2.101513802315227, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1500401496887207, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8699250221252441, + "num_tokens": 630238505.0, + "step": 16520 + }, + { + "epoch": 2.1016410125938174, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9165613651275635, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8630544543266296, + "num_tokens": 630275279.0, + "step": 16521 + }, + { + "epoch": 2.101768222872408, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8018449544906616, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8636940717697144, + "num_tokens": 630317288.0, + "step": 16522 + }, + { + "epoch": 2.1018954331509985, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.900027871131897, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.867828905582428, + "num_tokens": 630352786.0, + "step": 16523 + }, + { + "epoch": 2.102022643429589, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8503682613372803, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8667445182800293, + "num_tokens": 630391138.0, + "step": 16524 + }, + { + "epoch": 2.1021498537081795, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9622374773025513, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8531860113143921, + "num_tokens": 630426567.0, + "step": 16525 + }, + { + "epoch": 2.10227706398677, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3026514053344727, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8556649088859558, + "num_tokens": 630467308.0, + "step": 16526 + }, + { + "epoch": 2.1024042742653606, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.284538984298706, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.87687087059021, + "num_tokens": 630503333.0, + "step": 16527 + }, + { + "epoch": 2.102531484543951, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.3628971576690674, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8618507385253906, + "num_tokens": 630536838.0, + "step": 16528 + }, + { + "epoch": 2.1026586948225416, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9894883632659912, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8800120949745178, + "num_tokens": 630570833.0, + "step": 16529 + }, + { + "epoch": 2.102785905101132, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.740105152130127, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8749191761016846, + "num_tokens": 630615043.0, + "step": 16530 + }, + { + "epoch": 2.1029131153797227, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.816141128540039, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8885778188705444, + "num_tokens": 630651368.0, + "step": 16531 + }, + { + "epoch": 2.1030403256583132, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8554368019104004, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8586211800575256, + "num_tokens": 630690341.0, + "step": 16532 + }, + { + "epoch": 2.1031675359369038, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9330332279205322, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8633593320846558, + "num_tokens": 630727168.0, + "step": 16533 + }, + { + "epoch": 2.1032947462154943, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8416494131088257, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8713027238845825, + "num_tokens": 630766737.0, + "step": 16534 + }, + { + "epoch": 2.103421956494085, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0320379734039307, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.863475501537323, + "num_tokens": 630803721.0, + "step": 16535 + }, + { + "epoch": 2.1035491667726753, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.810765266418457, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8718306422233582, + "num_tokens": 630843697.0, + "step": 16536 + }, + { + "epoch": 2.103676377051266, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0694916248321533, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8673954010009766, + "num_tokens": 630874325.0, + "step": 16537 + }, + { + "epoch": 2.1038035873298564, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9169769287109375, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8633835911750793, + "num_tokens": 630911456.0, + "step": 16538 + }, + { + "epoch": 2.103930797608447, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.057359457015991, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8654022216796875, + "num_tokens": 630954418.0, + "step": 16539 + }, + { + "epoch": 2.1040580078870375, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0780303478240967, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8657665848731995, + "num_tokens": 630991826.0, + "step": 16540 + }, + { + "epoch": 2.104185218165628, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3075199127197266, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8679649829864502, + "num_tokens": 631025831.0, + "step": 16541 + }, + { + "epoch": 2.1043124284442185, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.994130253791809, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8534123301506042, + "num_tokens": 631062896.0, + "step": 16542 + }, + { + "epoch": 2.1044396387228086, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9555214643478394, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8720498085021973, + "num_tokens": 631099291.0, + "step": 16543 + }, + { + "epoch": 2.104566849001399, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8861680030822754, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.857171893119812, + "num_tokens": 631141570.0, + "step": 16544 + }, + { + "epoch": 2.1046940592799896, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9511770009994507, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.860220193862915, + "num_tokens": 631178882.0, + "step": 16545 + }, + { + "epoch": 2.10482126955858, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.7537543773651123, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8634437322616577, + "num_tokens": 631218059.0, + "step": 16546 + }, + { + "epoch": 2.1049484798371707, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.942092776298523, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.868728518486023, + "num_tokens": 631252827.0, + "step": 16547 + }, + { + "epoch": 2.1050756901157612, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8373949527740479, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8733810186386108, + "num_tokens": 631294370.0, + "step": 16548 + }, + { + "epoch": 2.1052029003943518, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8192898035049438, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.862629771232605, + "num_tokens": 631331091.0, + "step": 16549 + }, + { + "epoch": 2.1053301106729423, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.267503023147583, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8699067831039429, + "num_tokens": 631367285.0, + "step": 16550 + }, + { + "epoch": 2.105457320951533, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9867889881134033, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.853851318359375, + "num_tokens": 631402471.0, + "step": 16551 + }, + { + "epoch": 2.1055845312301233, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9696546792984009, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8654853105545044, + "num_tokens": 631439706.0, + "step": 16552 + }, + { + "epoch": 2.105711741508714, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.94167160987854, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8717026710510254, + "num_tokens": 631471437.0, + "step": 16553 + }, + { + "epoch": 2.1058389517873044, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9492406845092773, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8622764348983765, + "num_tokens": 631515028.0, + "step": 16554 + }, + { + "epoch": 2.105966162065895, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9019302129745483, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8613556623458862, + "num_tokens": 631552946.0, + "step": 16555 + }, + { + "epoch": 2.1060933723444855, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1265058517456055, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8709802627563477, + "num_tokens": 631596045.0, + "step": 16556 + }, + { + "epoch": 2.106220582623076, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8689966201782227, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8749204874038696, + "num_tokens": 631630233.0, + "step": 16557 + }, + { + "epoch": 2.1063477929016665, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 4.248361110687256, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8719215393066406, + "num_tokens": 631666665.0, + "step": 16558 + }, + { + "epoch": 2.106475003180257, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9599590301513672, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8629417419433594, + "num_tokens": 631709536.0, + "step": 16559 + }, + { + "epoch": 2.1066022134588476, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0199389457702637, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8809337019920349, + "num_tokens": 631744333.0, + "step": 16560 + }, + { + "epoch": 2.106729423737438, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8829911947250366, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8512621521949768, + "num_tokens": 631783522.0, + "step": 16561 + }, + { + "epoch": 2.1068566340160286, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8275997638702393, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8500015735626221, + "num_tokens": 631822579.0, + "step": 16562 + }, + { + "epoch": 2.106983844294619, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9516432285308838, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8750449419021606, + "num_tokens": 631862642.0, + "step": 16563 + }, + { + "epoch": 2.1071110545732097, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8495607376098633, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8767475485801697, + "num_tokens": 631903161.0, + "step": 16564 + }, + { + "epoch": 2.1072382648518, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0756912231445312, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.869127631187439, + "num_tokens": 631938149.0, + "step": 16565 + }, + { + "epoch": 2.1073654751303907, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8694480657577515, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8672523498535156, + "num_tokens": 631974717.0, + "step": 16566 + }, + { + "epoch": 2.107492685408981, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9658446311950684, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8707386255264282, + "num_tokens": 632011147.0, + "step": 16567 + }, + { + "epoch": 2.1076198956875714, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9479620456695557, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8727191686630249, + "num_tokens": 632051915.0, + "step": 16568 + }, + { + "epoch": 2.107747105966162, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.2082135677337646, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8574668169021606, + "num_tokens": 632083047.0, + "step": 16569 + }, + { + "epoch": 2.1078743162447524, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9525045156478882, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8783714175224304, + "num_tokens": 632119250.0, + "step": 16570 + }, + { + "epoch": 2.108001526523343, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1983253955841064, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8718210458755493, + "num_tokens": 632152696.0, + "step": 16571 + }, + { + "epoch": 2.1081287368019335, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8745571374893188, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8870921730995178, + "num_tokens": 632188167.0, + "step": 16572 + }, + { + "epoch": 2.108255947080524, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0259454250335693, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8638138771057129, + "num_tokens": 632229567.0, + "step": 16573 + }, + { + "epoch": 2.1083831573591145, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9238935708999634, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8775461316108704, + "num_tokens": 632266870.0, + "step": 16574 + }, + { + "epoch": 2.108510367637705, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9637049436569214, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8664534091949463, + "num_tokens": 632303844.0, + "step": 16575 + }, + { + "epoch": 2.1086375779162956, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.04054594039917, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8676431179046631, + "num_tokens": 632339343.0, + "step": 16576 + }, + { + "epoch": 2.108764788194886, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9378507137298584, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8704239130020142, + "num_tokens": 632385516.0, + "step": 16577 + }, + { + "epoch": 2.1088919984734766, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0023880004882812, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8713158369064331, + "num_tokens": 632420664.0, + "step": 16578 + }, + { + "epoch": 2.109019208752067, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9125521183013916, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8725361824035645, + "num_tokens": 632458082.0, + "step": 16579 + }, + { + "epoch": 2.1091464190306577, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0452091693878174, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8550592660903931, + "num_tokens": 632495963.0, + "step": 16580 + }, + { + "epoch": 2.109273629309248, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3681752681732178, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8678531050682068, + "num_tokens": 632529968.0, + "step": 16581 + }, + { + "epoch": 2.1094008395878387, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9682379961013794, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8743484020233154, + "num_tokens": 632567527.0, + "step": 16582 + }, + { + "epoch": 2.1095280498664293, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8512523174285889, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.865354597568512, + "num_tokens": 632606192.0, + "step": 16583 + }, + { + "epoch": 2.10965526014502, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0034868717193604, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8779178261756897, + "num_tokens": 632645940.0, + "step": 16584 + }, + { + "epoch": 2.1097824704236103, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8863232135772705, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8753993511199951, + "num_tokens": 632689385.0, + "step": 16585 + }, + { + "epoch": 2.109909680702201, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.371410608291626, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8723961710929871, + "num_tokens": 632720178.0, + "step": 16586 + }, + { + "epoch": 2.1100368909807914, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.028903007507324, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8594421148300171, + "num_tokens": 632763249.0, + "step": 16587 + }, + { + "epoch": 2.110164101259382, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9581043720245361, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.849108874797821, + "num_tokens": 632803293.0, + "step": 16588 + }, + { + "epoch": 2.1102913115379724, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1929194927215576, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8585364818572998, + "num_tokens": 632838731.0, + "step": 16589 + }, + { + "epoch": 2.110418521816563, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9922175407409668, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8570704460144043, + "num_tokens": 632875044.0, + "step": 16590 + }, + { + "epoch": 2.1105457320951535, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.987086534500122, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8463003635406494, + "num_tokens": 632911330.0, + "step": 16591 + }, + { + "epoch": 2.1106729423737436, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.942140817642212, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8569333553314209, + "num_tokens": 632952122.0, + "step": 16592 + }, + { + "epoch": 2.110800152652334, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.892441749572754, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8491874933242798, + "num_tokens": 632985602.0, + "step": 16593 + }, + { + "epoch": 2.1109273629309246, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9996567964553833, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8671261072158813, + "num_tokens": 633023792.0, + "step": 16594 + }, + { + "epoch": 2.111054573209515, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9045817852020264, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8685855865478516, + "num_tokens": 633065457.0, + "step": 16595 + }, + { + "epoch": 2.1111817834881057, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0166773796081543, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8738515973091125, + "num_tokens": 633101402.0, + "step": 16596 + }, + { + "epoch": 2.1113089937666962, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8697171211242676, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8583660125732422, + "num_tokens": 633147002.0, + "step": 16597 + }, + { + "epoch": 2.1114362040452868, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9841316938400269, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8662056922912598, + "num_tokens": 633184209.0, + "step": 16598 + }, + { + "epoch": 2.1115634143238773, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0324628353118896, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8703683614730835, + "num_tokens": 633224525.0, + "step": 16599 + }, + { + "epoch": 2.111690624602468, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.22900128364563, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8800657391548157, + "num_tokens": 633253171.0, + "step": 16600 + }, + { + "epoch": 2.1118178348810583, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8388524055480957, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8645955324172974, + "num_tokens": 633294326.0, + "step": 16601 + }, + { + "epoch": 2.111945045159649, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8735066652297974, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8818281292915344, + "num_tokens": 633327236.0, + "step": 16602 + }, + { + "epoch": 2.1120722554382394, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8973599672317505, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8770564794540405, + "num_tokens": 633368593.0, + "step": 16603 + }, + { + "epoch": 2.11219946571683, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3155808448791504, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8456510901451111, + "num_tokens": 633402598.0, + "step": 16604 + }, + { + "epoch": 2.1123266759954205, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.12971568107605, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8614451289176941, + "num_tokens": 633437069.0, + "step": 16605 + }, + { + "epoch": 2.112453886274011, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8328580856323242, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8683268427848816, + "num_tokens": 633479752.0, + "step": 16606 + }, + { + "epoch": 2.1125810965526015, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9723529815673828, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.863109827041626, + "num_tokens": 633516949.0, + "step": 16607 + }, + { + "epoch": 2.112708306831192, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.0095386505126953, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8701615333557129, + "num_tokens": 633547511.0, + "step": 16608 + }, + { + "epoch": 2.1128355171097826, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.2028331756591797, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8687472939491272, + "num_tokens": 633583940.0, + "step": 16609 + }, + { + "epoch": 2.112962727388373, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9367914199829102, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.855523943901062, + "num_tokens": 633626245.0, + "step": 16610 + }, + { + "epoch": 2.1130899376669636, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.886823296546936, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8692276477813721, + "num_tokens": 633662889.0, + "step": 16611 + }, + { + "epoch": 2.113217147945554, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8997328281402588, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8630669116973877, + "num_tokens": 633701461.0, + "step": 16612 + }, + { + "epoch": 2.1133443582241447, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.863362193107605, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8678727746009827, + "num_tokens": 633741302.0, + "step": 16613 + }, + { + "epoch": 2.113471568502735, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.074552297592163, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8715084195137024, + "num_tokens": 633780337.0, + "step": 16614 + }, + { + "epoch": 2.1135987787813257, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.984115481376648, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.872947633266449, + "num_tokens": 633814450.0, + "step": 16615 + }, + { + "epoch": 2.1137259890599163, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.9742895364761353, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8597017526626587, + "num_tokens": 633856252.0, + "step": 16616 + }, + { + "epoch": 2.1138531993385064, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9583076238632202, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.854589581489563, + "num_tokens": 633895235.0, + "step": 16617 + }, + { + "epoch": 2.113980409617097, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8273042440414429, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8683244585990906, + "num_tokens": 633938948.0, + "step": 16618 + }, + { + "epoch": 2.1141076198956874, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8414653539657593, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8754192590713501, + "num_tokens": 633973393.0, + "step": 16619 + }, + { + "epoch": 2.114234830174278, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8962374925613403, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8722972869873047, + "num_tokens": 634006714.0, + "step": 16620 + }, + { + "epoch": 2.1143620404528685, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.752894639968872, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.872673511505127, + "num_tokens": 634046104.0, + "step": 16621 + }, + { + "epoch": 2.114489250731459, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.8479756116867065, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.859033465385437, + "num_tokens": 634086401.0, + "step": 16622 + }, + { + "epoch": 2.1146164610100495, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9171135425567627, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8584843277931213, + "num_tokens": 634125479.0, + "step": 16623 + }, + { + "epoch": 2.11474367128864, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1564223766326904, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8477653861045837, + "num_tokens": 634163873.0, + "step": 16624 + }, + { + "epoch": 2.1148708815672306, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.7852692604064941, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8647887110710144, + "num_tokens": 634200533.0, + "step": 16625 + }, + { + "epoch": 2.114998091845821, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.995689034461975, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8595346212387085, + "num_tokens": 634235891.0, + "step": 16626 + }, + { + "epoch": 2.1151253021244116, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0111582279205322, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8668814897537231, + "num_tokens": 634266748.0, + "step": 16627 + }, + { + "epoch": 2.115252512403002, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9718233346939087, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8526874780654907, + "num_tokens": 634302603.0, + "step": 16628 + }, + { + "epoch": 2.1153797226815927, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8020943403244019, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8677940368652344, + "num_tokens": 634345471.0, + "step": 16629 + }, + { + "epoch": 2.115506932960183, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8946731090545654, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.866254448890686, + "num_tokens": 634383902.0, + "step": 16630 + }, + { + "epoch": 2.1156341432387737, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9223960638046265, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8586185574531555, + "num_tokens": 634421082.0, + "step": 16631 + }, + { + "epoch": 2.1157613535173643, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9814203977584839, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.861117959022522, + "num_tokens": 634456490.0, + "step": 16632 + }, + { + "epoch": 2.115888563795955, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.937686562538147, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8475355505943298, + "num_tokens": 634498252.0, + "step": 16633 + }, + { + "epoch": 2.1160157740745453, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0856223106384277, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8749333024024963, + "num_tokens": 634540769.0, + "step": 16634 + }, + { + "epoch": 2.116142984353136, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1159462928771973, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8739340305328369, + "num_tokens": 634573807.0, + "step": 16635 + }, + { + "epoch": 2.1162701946317264, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9140297174453735, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.861768901348114, + "num_tokens": 634609542.0, + "step": 16636 + }, + { + "epoch": 2.116397404910317, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.791748285293579, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8734700083732605, + "num_tokens": 634648641.0, + "step": 16637 + }, + { + "epoch": 2.1165246151889074, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.941846489906311, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8639829158782959, + "num_tokens": 634685531.0, + "step": 16638 + }, + { + "epoch": 2.116651825467498, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2064003944396973, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8691527247428894, + "num_tokens": 634717239.0, + "step": 16639 + }, + { + "epoch": 2.116779035746088, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9487296342849731, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8640360832214355, + "num_tokens": 634750058.0, + "step": 16640 + }, + { + "epoch": 2.1169062460246786, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7639377117156982, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8599773645401001, + "num_tokens": 634793480.0, + "step": 16641 + }, + { + "epoch": 2.117033456303269, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.065591335296631, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.857295036315918, + "num_tokens": 634829923.0, + "step": 16642 + }, + { + "epoch": 2.1171606665818596, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8838990926742554, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8600763082504272, + "num_tokens": 634868499.0, + "step": 16643 + }, + { + "epoch": 2.11728787686045, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9287614822387695, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8750948905944824, + "num_tokens": 634905922.0, + "step": 16644 + }, + { + "epoch": 2.1174150871390407, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.039989948272705, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8679433465003967, + "num_tokens": 634946190.0, + "step": 16645 + }, + { + "epoch": 2.1175422974176312, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0087952613830566, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8781517744064331, + "num_tokens": 634981840.0, + "step": 16646 + }, + { + "epoch": 2.1176695076962218, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.023005247116089, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8640412092208862, + "num_tokens": 635024755.0, + "step": 16647 + }, + { + "epoch": 2.1177967179748123, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8781356811523438, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.87044358253479, + "num_tokens": 635063076.0, + "step": 16648 + }, + { + "epoch": 2.117923928253403, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9407517910003662, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8787035942077637, + "num_tokens": 635103052.0, + "step": 16649 + }, + { + "epoch": 2.1180511385319933, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0883631706237793, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8753898739814758, + "num_tokens": 635140917.0, + "step": 16650 + }, + { + "epoch": 2.118178348810584, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8826757669448853, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8702301979064941, + "num_tokens": 635182049.0, + "step": 16651 + }, + { + "epoch": 2.1183055590891744, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9844772815704346, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8593443632125854, + "num_tokens": 635217613.0, + "step": 16652 + }, + { + "epoch": 2.118432769367765, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8375617265701294, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8755542039871216, + "num_tokens": 635252716.0, + "step": 16653 + }, + { + "epoch": 2.1185599796463555, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.919144630432129, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8689486384391785, + "num_tokens": 635288919.0, + "step": 16654 + }, + { + "epoch": 2.118687189924946, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.886211633682251, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.861007571220398, + "num_tokens": 635329781.0, + "step": 16655 + }, + { + "epoch": 2.1188144002035365, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9711120128631592, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8752598762512207, + "num_tokens": 635363844.0, + "step": 16656 + }, + { + "epoch": 2.118941610482127, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.048936128616333, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8627218008041382, + "num_tokens": 635397568.0, + "step": 16657 + }, + { + "epoch": 2.1190688207607176, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.058537721633911, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8523047566413879, + "num_tokens": 635432903.0, + "step": 16658 + }, + { + "epoch": 2.119196031039308, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9148231744766235, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8679460883140564, + "num_tokens": 635473388.0, + "step": 16659 + }, + { + "epoch": 2.1193232413178986, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.789918065071106, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8693421483039856, + "num_tokens": 635512966.0, + "step": 16660 + }, + { + "epoch": 2.119450451596489, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9873003959655762, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8529303073883057, + "num_tokens": 635547814.0, + "step": 16661 + }, + { + "epoch": 2.1195776618750797, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.106083869934082, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8572914600372314, + "num_tokens": 635583120.0, + "step": 16662 + }, + { + "epoch": 2.11970487215367, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0241425037384033, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8599135279655457, + "num_tokens": 635621531.0, + "step": 16663 + }, + { + "epoch": 2.1198320824322607, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0488059520721436, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8538517951965332, + "num_tokens": 635659641.0, + "step": 16664 + }, + { + "epoch": 2.119959292710851, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9499303102493286, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8613218665122986, + "num_tokens": 635697953.0, + "step": 16665 + }, + { + "epoch": 2.1200865029894413, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8678110837936401, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8420716524124146, + "num_tokens": 635742519.0, + "step": 16666 + }, + { + "epoch": 2.120213713268032, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.859548568725586, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8738890886306763, + "num_tokens": 635777786.0, + "step": 16667 + }, + { + "epoch": 2.1203409235466224, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8422199487686157, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8670079708099365, + "num_tokens": 635814490.0, + "step": 16668 + }, + { + "epoch": 2.120468133825213, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.79680597782135, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8529679179191589, + "num_tokens": 635858351.0, + "step": 16669 + }, + { + "epoch": 2.1205953441038035, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1088507175445557, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.862990140914917, + "num_tokens": 635895585.0, + "step": 16670 + }, + { + "epoch": 2.120722554382394, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9268540143966675, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8731307983398438, + "num_tokens": 635933591.0, + "step": 16671 + }, + { + "epoch": 2.1208497646609845, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9532160758972168, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8634597063064575, + "num_tokens": 635972040.0, + "step": 16672 + }, + { + "epoch": 2.120976974939575, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9077526330947876, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8692054152488708, + "num_tokens": 636013287.0, + "step": 16673 + }, + { + "epoch": 2.1211041852181656, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9942234754562378, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8619595766067505, + "num_tokens": 636046137.0, + "step": 16674 + }, + { + "epoch": 2.121231395496756, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1285195350646973, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8560804724693298, + "num_tokens": 636088870.0, + "step": 16675 + }, + { + "epoch": 2.1213586057753466, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8767833709716797, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8757150769233704, + "num_tokens": 636130519.0, + "step": 16676 + }, + { + "epoch": 2.121485816053937, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9733734130859375, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8701885342597961, + "num_tokens": 636165641.0, + "step": 16677 + }, + { + "epoch": 2.1216130263325277, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.924276351928711, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8521745204925537, + "num_tokens": 636203811.0, + "step": 16678 + }, + { + "epoch": 2.121740236611118, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8643248081207275, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8778806924819946, + "num_tokens": 636239503.0, + "step": 16679 + }, + { + "epoch": 2.1218674468897087, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.7845656871795654, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8694759011268616, + "num_tokens": 636278994.0, + "step": 16680 + }, + { + "epoch": 2.1219946571682993, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.898390531539917, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8620844483375549, + "num_tokens": 636318025.0, + "step": 16681 + }, + { + "epoch": 2.12212186744689, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8274226188659668, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8679990768432617, + "num_tokens": 636357289.0, + "step": 16682 + }, + { + "epoch": 2.1222490777254803, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.007026195526123, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8509470224380493, + "num_tokens": 636399067.0, + "step": 16683 + }, + { + "epoch": 2.122376288004071, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7804248332977295, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8713058829307556, + "num_tokens": 636442418.0, + "step": 16684 + }, + { + "epoch": 2.1225034982826614, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.813035488128662, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8542739748954773, + "num_tokens": 636484273.0, + "step": 16685 + }, + { + "epoch": 2.122630708561252, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8098478317260742, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8734574317932129, + "num_tokens": 636525675.0, + "step": 16686 + }, + { + "epoch": 2.1227579188398424, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.873053789138794, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.865058183670044, + "num_tokens": 636559652.0, + "step": 16687 + }, + { + "epoch": 2.122885129118433, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9252468347549438, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8861769437789917, + "num_tokens": 636597900.0, + "step": 16688 + }, + { + "epoch": 2.1230123393970235, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9042695760726929, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8666058778762817, + "num_tokens": 636635822.0, + "step": 16689 + }, + { + "epoch": 2.1231395496756136, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9618091583251953, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8558751344680786, + "num_tokens": 636667275.0, + "step": 16690 + }, + { + "epoch": 2.123266759954204, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9673720598220825, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8679643869400024, + "num_tokens": 636704723.0, + "step": 16691 + }, + { + "epoch": 2.1233939702327946, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8763800859451294, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8708795309066772, + "num_tokens": 636748209.0, + "step": 16692 + }, + { + "epoch": 2.123521180511385, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9906502962112427, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8710635304450989, + "num_tokens": 636785052.0, + "step": 16693 + }, + { + "epoch": 2.1236483907899757, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.035261631011963, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.878844141960144, + "num_tokens": 636816073.0, + "step": 16694 + }, + { + "epoch": 2.123775601068566, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0078046321868896, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8715454339981079, + "num_tokens": 636848963.0, + "step": 16695 + }, + { + "epoch": 2.1239028113471567, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.4406309127807617, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8775648474693298, + "num_tokens": 636880632.0, + "step": 16696 + }, + { + "epoch": 2.1240300216257473, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0175821781158447, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8530420064926147, + "num_tokens": 636923597.0, + "step": 16697 + }, + { + "epoch": 2.124157231904338, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9071491956710815, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.860824704170227, + "num_tokens": 636962250.0, + "step": 16698 + }, + { + "epoch": 2.1242844421829283, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9337663650512695, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8736563920974731, + "num_tokens": 636997973.0, + "step": 16699 + }, + { + "epoch": 2.124411652461519, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.003990411758423, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8544385433197021, + "num_tokens": 637037320.0, + "step": 16700 + }, + { + "epoch": 2.1245388627401094, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.967982292175293, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8434252738952637, + "num_tokens": 637072267.0, + "step": 16701 + }, + { + "epoch": 2.1246660730187, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9471490383148193, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8811950087547302, + "num_tokens": 637108782.0, + "step": 16702 + }, + { + "epoch": 2.1247932832972904, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8537129163742065, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.865595817565918, + "num_tokens": 637147368.0, + "step": 16703 + }, + { + "epoch": 2.124920493575881, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.888037085533142, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8749819993972778, + "num_tokens": 637181909.0, + "step": 16704 + }, + { + "epoch": 2.1250477038544715, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.021170139312744, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8642810583114624, + "num_tokens": 637218498.0, + "step": 16705 + }, + { + "epoch": 2.125174914133062, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9094997644424438, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.866848886013031, + "num_tokens": 637254825.0, + "step": 16706 + }, + { + "epoch": 2.1253021244116526, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9731508493423462, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8562162518501282, + "num_tokens": 637301593.0, + "step": 16707 + }, + { + "epoch": 2.125429334690243, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.7299818992614746, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.877536952495575, + "num_tokens": 637341676.0, + "step": 16708 + }, + { + "epoch": 2.1255565449688336, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8298425674438477, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8711709976196289, + "num_tokens": 637378840.0, + "step": 16709 + }, + { + "epoch": 2.125683755247424, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0139970779418945, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8481158018112183, + "num_tokens": 637412711.0, + "step": 16710 + }, + { + "epoch": 2.1258109655260147, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.150550365447998, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8637433648109436, + "num_tokens": 637448287.0, + "step": 16711 + }, + { + "epoch": 2.125938175804605, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.885299563407898, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8641507029533386, + "num_tokens": 637487899.0, + "step": 16712 + }, + { + "epoch": 2.1260653860831957, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.023954153060913, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8700525760650635, + "num_tokens": 637528161.0, + "step": 16713 + }, + { + "epoch": 2.1261925963617863, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8846046924591064, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8536828756332397, + "num_tokens": 637565692.0, + "step": 16714 + }, + { + "epoch": 2.1263198066403763, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0308685302734375, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8542223572731018, + "num_tokens": 637600368.0, + "step": 16715 + }, + { + "epoch": 2.126447016918967, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.914894461631775, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8629311323165894, + "num_tokens": 637635732.0, + "step": 16716 + }, + { + "epoch": 2.1265742271975574, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.773989200592041, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.861144483089447, + "num_tokens": 637677127.0, + "step": 16717 + }, + { + "epoch": 2.126701437476148, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.827441930770874, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.865221381187439, + "num_tokens": 637715278.0, + "step": 16718 + }, + { + "epoch": 2.1268286477547385, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0420584678649902, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8693114519119263, + "num_tokens": 637742937.0, + "step": 16719 + }, + { + "epoch": 2.126955858033329, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9597774744033813, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8681781888008118, + "num_tokens": 637776761.0, + "step": 16720 + }, + { + "epoch": 2.1270830683119195, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9512474536895752, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8696617484092712, + "num_tokens": 637812197.0, + "step": 16721 + }, + { + "epoch": 2.12721027859051, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9267032146453857, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.870925784111023, + "num_tokens": 637850442.0, + "step": 16722 + }, + { + "epoch": 2.1273374888691006, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8519506454467773, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.867441713809967, + "num_tokens": 637889352.0, + "step": 16723 + }, + { + "epoch": 2.127464699147691, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8851808309555054, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8477331399917603, + "num_tokens": 637927195.0, + "step": 16724 + }, + { + "epoch": 2.1275919094262816, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.827350378036499, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.85408616065979, + "num_tokens": 637969431.0, + "step": 16725 + }, + { + "epoch": 2.127719119704872, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7183876037597656, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8820474147796631, + "num_tokens": 638013910.0, + "step": 16726 + }, + { + "epoch": 2.1278463299834627, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9085924625396729, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8549265265464783, + "num_tokens": 638051848.0, + "step": 16727 + }, + { + "epoch": 2.127973540262053, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9808682203292847, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8666723966598511, + "num_tokens": 638092192.0, + "step": 16728 + }, + { + "epoch": 2.1281007505406437, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8399642705917358, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8766318559646606, + "num_tokens": 638134306.0, + "step": 16729 + }, + { + "epoch": 2.1282279608192343, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.78571355342865, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8520582914352417, + "num_tokens": 638177988.0, + "step": 16730 + }, + { + "epoch": 2.128355171097825, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.105724811553955, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8769875168800354, + "num_tokens": 638212237.0, + "step": 16731 + }, + { + "epoch": 2.1284823813764153, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.057325839996338, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.858613133430481, + "num_tokens": 638252492.0, + "step": 16732 + }, + { + "epoch": 2.128609591655006, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9259387254714966, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8652639389038086, + "num_tokens": 638294308.0, + "step": 16733 + }, + { + "epoch": 2.1287368019335964, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9498612880706787, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8711212873458862, + "num_tokens": 638330178.0, + "step": 16734 + }, + { + "epoch": 2.128864012212187, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0670106410980225, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8467954993247986, + "num_tokens": 638372731.0, + "step": 16735 + }, + { + "epoch": 2.1289912224907774, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.923980474472046, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8624398708343506, + "num_tokens": 638407522.0, + "step": 16736 + }, + { + "epoch": 2.129118432769368, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9855289459228516, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8678179979324341, + "num_tokens": 638443262.0, + "step": 16737 + }, + { + "epoch": 2.129245643047958, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8845125436782837, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8663045763969421, + "num_tokens": 638482729.0, + "step": 16738 + }, + { + "epoch": 2.129372853326549, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8555798530578613, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8740010857582092, + "num_tokens": 638520034.0, + "step": 16739 + }, + { + "epoch": 2.129500063605139, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8502610921859741, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8739215135574341, + "num_tokens": 638558631.0, + "step": 16740 + }, + { + "epoch": 2.1296272738837296, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9435365200042725, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.877578854560852, + "num_tokens": 638593565.0, + "step": 16741 + }, + { + "epoch": 2.12975448416232, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.040952682495117, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8701337575912476, + "num_tokens": 638625952.0, + "step": 16742 + }, + { + "epoch": 2.1298816944409107, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.010859727859497, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8630493879318237, + "num_tokens": 638662000.0, + "step": 16743 + }, + { + "epoch": 2.130008904719501, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8410276174545288, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8524872064590454, + "num_tokens": 638706020.0, + "step": 16744 + }, + { + "epoch": 2.1301361149980917, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9651685953140259, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8722196221351624, + "num_tokens": 638745354.0, + "step": 16745 + }, + { + "epoch": 2.1302633252766823, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8430172204971313, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8606684803962708, + "num_tokens": 638785497.0, + "step": 16746 + }, + { + "epoch": 2.130390535555273, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.081106662750244, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.844944179058075, + "num_tokens": 638821650.0, + "step": 16747 + }, + { + "epoch": 2.1305177458338633, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0564889907836914, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8519030809402466, + "num_tokens": 638855589.0, + "step": 16748 + }, + { + "epoch": 2.130644956112454, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9251272678375244, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8645214438438416, + "num_tokens": 638895766.0, + "step": 16749 + }, + { + "epoch": 2.1307721663910444, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.007904052734375, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.855986475944519, + "num_tokens": 638932622.0, + "step": 16750 + }, + { + "epoch": 2.130899376669635, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7984247207641602, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8568079471588135, + "num_tokens": 638971588.0, + "step": 16751 + }, + { + "epoch": 2.1310265869482254, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8627638816833496, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.870715856552124, + "num_tokens": 639009921.0, + "step": 16752 + }, + { + "epoch": 2.131153797226816, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.909913420677185, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8656777739524841, + "num_tokens": 639048160.0, + "step": 16753 + }, + { + "epoch": 2.1312810075054065, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8944658041000366, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8780066967010498, + "num_tokens": 639080260.0, + "step": 16754 + }, + { + "epoch": 2.131408217783997, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8511552810668945, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8584041595458984, + "num_tokens": 639118758.0, + "step": 16755 + }, + { + "epoch": 2.1315354280625876, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.958569049835205, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8654109239578247, + "num_tokens": 639159528.0, + "step": 16756 + }, + { + "epoch": 2.131662638341178, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8167390823364258, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8698441982269287, + "num_tokens": 639200394.0, + "step": 16757 + }, + { + "epoch": 2.1317898486197686, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8665027618408203, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8582603931427002, + "num_tokens": 639240938.0, + "step": 16758 + }, + { + "epoch": 2.131917058898359, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9834667444229126, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8595783710479736, + "num_tokens": 639278097.0, + "step": 16759 + }, + { + "epoch": 2.1320442691769497, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7179110050201416, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8657063245773315, + "num_tokens": 639324512.0, + "step": 16760 + }, + { + "epoch": 2.13217147945554, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.6838407516479492, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8703641891479492, + "num_tokens": 639366833.0, + "step": 16761 + }, + { + "epoch": 2.1322986897341307, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8157894611358643, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8504555225372314, + "num_tokens": 639405765.0, + "step": 16762 + }, + { + "epoch": 2.132425900012721, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.072502374649048, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8625661134719849, + "num_tokens": 639439622.0, + "step": 16763 + }, + { + "epoch": 2.1325531102913113, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.942897081375122, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8760206699371338, + "num_tokens": 639479943.0, + "step": 16764 + }, + { + "epoch": 2.132680320569902, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9595280885696411, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8604315519332886, + "num_tokens": 639520487.0, + "step": 16765 + }, + { + "epoch": 2.1328075308484924, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.920772671699524, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8648262619972229, + "num_tokens": 639558997.0, + "step": 16766 + }, + { + "epoch": 2.132934741127083, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.893714427947998, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8645800948143005, + "num_tokens": 639594569.0, + "step": 16767 + }, + { + "epoch": 2.1330619514056735, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7915176153182983, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8797571659088135, + "num_tokens": 639629827.0, + "step": 16768 + }, + { + "epoch": 2.133189161684264, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9169560670852661, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8589823246002197, + "num_tokens": 639667634.0, + "step": 16769 + }, + { + "epoch": 2.1333163719628545, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.092613697052002, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8625047206878662, + "num_tokens": 639700536.0, + "step": 16770 + }, + { + "epoch": 2.133443582241445, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8414362668991089, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.868661642074585, + "num_tokens": 639744006.0, + "step": 16771 + }, + { + "epoch": 2.1335707925200356, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 16.601179122924805, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8718143105506897, + "num_tokens": 639784950.0, + "step": 16772 + }, + { + "epoch": 2.133698002798626, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.216925859451294, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8793518543243408, + "num_tokens": 639823567.0, + "step": 16773 + }, + { + "epoch": 2.1338252130772166, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.122915744781494, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8766549229621887, + "num_tokens": 639863386.0, + "step": 16774 + }, + { + "epoch": 2.133952423355807, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9098942279815674, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8543097972869873, + "num_tokens": 639901219.0, + "step": 16775 + }, + { + "epoch": 2.1340796336343977, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.108546495437622, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.850492000579834, + "num_tokens": 639935240.0, + "step": 16776 + }, + { + "epoch": 2.134206843912988, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8321658372879028, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8571070432662964, + "num_tokens": 639975370.0, + "step": 16777 + }, + { + "epoch": 2.1343340541915787, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7816177606582642, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8464353084564209, + "num_tokens": 640020463.0, + "step": 16778 + }, + { + "epoch": 2.1344612644701693, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8939374685287476, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8610345125198364, + "num_tokens": 640057142.0, + "step": 16779 + }, + { + "epoch": 2.13458847474876, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8094221353530884, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8624357581138611, + "num_tokens": 640098879.0, + "step": 16780 + }, + { + "epoch": 2.1347156850273503, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8596646785736084, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8682596683502197, + "num_tokens": 640136077.0, + "step": 16781 + }, + { + "epoch": 2.134842895305941, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9019896984100342, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8519037365913391, + "num_tokens": 640175160.0, + "step": 16782 + }, + { + "epoch": 2.1349701055845314, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7735381126403809, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8899370431900024, + "num_tokens": 640216670.0, + "step": 16783 + }, + { + "epoch": 2.135097315863122, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8980342149734497, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8622693419456482, + "num_tokens": 640254872.0, + "step": 16784 + }, + { + "epoch": 2.1352245261417124, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9218107461929321, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8628620505332947, + "num_tokens": 640289319.0, + "step": 16785 + }, + { + "epoch": 2.135351736420303, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8649650812149048, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8794589042663574, + "num_tokens": 640323897.0, + "step": 16786 + }, + { + "epoch": 2.1354789466988935, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8479729890823364, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8722514510154724, + "num_tokens": 640366203.0, + "step": 16787 + }, + { + "epoch": 2.1356061569774836, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7205946445465088, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.874833345413208, + "num_tokens": 640408089.0, + "step": 16788 + }, + { + "epoch": 2.135733367256074, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8686715364456177, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8757350444793701, + "num_tokens": 640443532.0, + "step": 16789 + }, + { + "epoch": 2.1358605775346646, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.000636577606201, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8589556217193604, + "num_tokens": 640480425.0, + "step": 16790 + }, + { + "epoch": 2.135987787813255, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7427393198013306, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8688850402832031, + "num_tokens": 640525658.0, + "step": 16791 + }, + { + "epoch": 2.1361149980918457, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.150390625, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8749253749847412, + "num_tokens": 640563276.0, + "step": 16792 + }, + { + "epoch": 2.136242208370436, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7665678262710571, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8586263656616211, + "num_tokens": 640608150.0, + "step": 16793 + }, + { + "epoch": 2.1363694186490267, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7235112190246582, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.867878794670105, + "num_tokens": 640646127.0, + "step": 16794 + }, + { + "epoch": 2.1364966289276173, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0246529579162598, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8721991181373596, + "num_tokens": 640683195.0, + "step": 16795 + }, + { + "epoch": 2.136623839206208, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8476368188858032, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8794236183166504, + "num_tokens": 640724102.0, + "step": 16796 + }, + { + "epoch": 2.1367510494847983, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8108574151992798, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8618893623352051, + "num_tokens": 640762624.0, + "step": 16797 + }, + { + "epoch": 2.136878259763389, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0038373470306396, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8600901365280151, + "num_tokens": 640799437.0, + "step": 16798 + }, + { + "epoch": 2.1370054700419794, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.863137125968933, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8709944486618042, + "num_tokens": 640837115.0, + "step": 16799 + }, + { + "epoch": 2.13713268032057, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6773627996444702, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8675233125686646, + "num_tokens": 640882312.0, + "step": 16800 + }, + { + "epoch": 2.1372598905991604, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0006320476531982, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8654320240020752, + "num_tokens": 640921032.0, + "step": 16801 + }, + { + "epoch": 2.137387100877751, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0834481716156006, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8678517937660217, + "num_tokens": 640957424.0, + "step": 16802 + }, + { + "epoch": 2.1375143111563415, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0747029781341553, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.85816490650177, + "num_tokens": 640992985.0, + "step": 16803 + }, + { + "epoch": 2.137641521434932, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9126291275024414, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8765789270401001, + "num_tokens": 641030379.0, + "step": 16804 + }, + { + "epoch": 2.1377687317135226, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.91193687915802, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8576360940933228, + "num_tokens": 641068526.0, + "step": 16805 + }, + { + "epoch": 2.137895941992113, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9776636362075806, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8570382595062256, + "num_tokens": 641103351.0, + "step": 16806 + }, + { + "epoch": 2.1380231522707036, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 80.52233123779297, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.868245005607605, + "num_tokens": 641143570.0, + "step": 16807 + }, + { + "epoch": 2.138150362549294, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8940188884735107, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8694056272506714, + "num_tokens": 641180625.0, + "step": 16808 + }, + { + "epoch": 2.1382775728278847, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.965065360069275, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8438891768455505, + "num_tokens": 641226770.0, + "step": 16809 + }, + { + "epoch": 2.138404783106475, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.754663348197937, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8769595623016357, + "num_tokens": 641269682.0, + "step": 16810 + }, + { + "epoch": 2.1385319933850653, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9398386478424072, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8777965307235718, + "num_tokens": 641303803.0, + "step": 16811 + }, + { + "epoch": 2.1386592036636562, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8027359247207642, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8604179620742798, + "num_tokens": 641343141.0, + "step": 16812 + }, + { + "epoch": 2.1387864139422463, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8379212617874146, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8692757487297058, + "num_tokens": 641384006.0, + "step": 16813 + }, + { + "epoch": 2.138913624220837, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9607253074645996, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8441741466522217, + "num_tokens": 641423998.0, + "step": 16814 + }, + { + "epoch": 2.1390408344994274, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0004079341888428, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8642547726631165, + "num_tokens": 641457886.0, + "step": 16815 + }, + { + "epoch": 2.139168044778018, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7019641399383545, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8668252825737, + "num_tokens": 641503386.0, + "step": 16816 + }, + { + "epoch": 2.1392952550566084, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8679907321929932, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8761533498764038, + "num_tokens": 641539533.0, + "step": 16817 + }, + { + "epoch": 2.139422465335199, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.4555552005767822, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8600078821182251, + "num_tokens": 641574020.0, + "step": 16818 + }, + { + "epoch": 2.1395496756137895, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2355661392211914, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8707935214042664, + "num_tokens": 641605206.0, + "step": 16819 + }, + { + "epoch": 2.13967688589238, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8265597820281982, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8554925918579102, + "num_tokens": 641649857.0, + "step": 16820 + }, + { + "epoch": 2.1398040961709706, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0494627952575684, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8593341112136841, + "num_tokens": 641683360.0, + "step": 16821 + }, + { + "epoch": 2.139931306449561, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.956190824508667, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8660004734992981, + "num_tokens": 641720105.0, + "step": 16822 + }, + { + "epoch": 2.1400585167281516, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9960275888442993, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8682538270950317, + "num_tokens": 641750718.0, + "step": 16823 + }, + { + "epoch": 2.140185727006742, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7696425914764404, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8640215992927551, + "num_tokens": 641791620.0, + "step": 16824 + }, + { + "epoch": 2.1403129372853327, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9570643901824951, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8539056777954102, + "num_tokens": 641833566.0, + "step": 16825 + }, + { + "epoch": 2.140440147563923, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9872101545333862, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8723025321960449, + "num_tokens": 641869080.0, + "step": 16826 + }, + { + "epoch": 2.1405673578425137, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9249528646469116, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8629505038261414, + "num_tokens": 641910195.0, + "step": 16827 + }, + { + "epoch": 2.1406945681211043, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.027794361114502, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8527368307113647, + "num_tokens": 641947270.0, + "step": 16828 + }, + { + "epoch": 2.140821778399695, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8204960823059082, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8674075603485107, + "num_tokens": 641986472.0, + "step": 16829 + }, + { + "epoch": 2.1409489886782853, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9741779565811157, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.853268027305603, + "num_tokens": 642028130.0, + "step": 16830 + }, + { + "epoch": 2.141076198956876, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8878358602523804, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.860906720161438, + "num_tokens": 642067097.0, + "step": 16831 + }, + { + "epoch": 2.1412034092354664, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.052269220352173, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8768763542175293, + "num_tokens": 642099628.0, + "step": 16832 + }, + { + "epoch": 2.141330619514057, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0088605880737305, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8888481855392456, + "num_tokens": 642135941.0, + "step": 16833 + }, + { + "epoch": 2.1414578297926474, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.097653865814209, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8583608865737915, + "num_tokens": 642169465.0, + "step": 16834 + }, + { + "epoch": 2.141585040071238, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9518898725509644, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8678757548332214, + "num_tokens": 642208961.0, + "step": 16835 + }, + { + "epoch": 2.141712250349828, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.962489366531372, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8632614612579346, + "num_tokens": 642249691.0, + "step": 16836 + }, + { + "epoch": 2.141839460628419, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.896554708480835, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8682800531387329, + "num_tokens": 642292425.0, + "step": 16837 + }, + { + "epoch": 2.141966670907009, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7876453399658203, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8759620785713196, + "num_tokens": 642330982.0, + "step": 16838 + }, + { + "epoch": 2.1420938811855996, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9769959449768066, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8658450841903687, + "num_tokens": 642365414.0, + "step": 16839 + }, + { + "epoch": 2.14222109146419, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.000689744949341, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8589807152748108, + "num_tokens": 642398660.0, + "step": 16840 + }, + { + "epoch": 2.1423483017427807, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.104956865310669, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8733292818069458, + "num_tokens": 642432605.0, + "step": 16841 + }, + { + "epoch": 2.142475512021371, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.4059345722198486, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8753021955490112, + "num_tokens": 642472041.0, + "step": 16842 + }, + { + "epoch": 2.1426027222999617, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8741798400878906, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8753930926322937, + "num_tokens": 642512399.0, + "step": 16843 + }, + { + "epoch": 2.1427299325785523, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9218168258666992, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8591058254241943, + "num_tokens": 642550829.0, + "step": 16844 + }, + { + "epoch": 2.142857142857143, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9829050302505493, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8648530840873718, + "num_tokens": 642591255.0, + "step": 16845 + }, + { + "epoch": 2.1429843531357333, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0582261085510254, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8670974373817444, + "num_tokens": 642625772.0, + "step": 16846 + }, + { + "epoch": 2.143111563414324, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8451350927352905, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8586505651473999, + "num_tokens": 642667596.0, + "step": 16847 + }, + { + "epoch": 2.1432387736929144, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9017598628997803, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8669736385345459, + "num_tokens": 642702159.0, + "step": 16848 + }, + { + "epoch": 2.143365983971505, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1849429607391357, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8800261616706848, + "num_tokens": 642737307.0, + "step": 16849 + }, + { + "epoch": 2.1434931942500954, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9550366401672363, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8552159070968628, + "num_tokens": 642776660.0, + "step": 16850 + }, + { + "epoch": 2.143620404528686, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9093466997146606, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8599170446395874, + "num_tokens": 642814131.0, + "step": 16851 + }, + { + "epoch": 2.1437476148072765, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9669981002807617, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8460361957550049, + "num_tokens": 642854138.0, + "step": 16852 + }, + { + "epoch": 2.143874825085867, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8164232969284058, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8718850612640381, + "num_tokens": 642893767.0, + "step": 16853 + }, + { + "epoch": 2.1440020353644575, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8531911373138428, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8809913992881775, + "num_tokens": 642928286.0, + "step": 16854 + }, + { + "epoch": 2.144129245643048, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7661665678024292, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.873664379119873, + "num_tokens": 642971220.0, + "step": 16855 + }, + { + "epoch": 2.1442564559216386, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.6979703903198242, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8703067898750305, + "num_tokens": 643013618.0, + "step": 16856 + }, + { + "epoch": 2.144383666200229, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.6985870599746704, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8682600259780884, + "num_tokens": 643058122.0, + "step": 16857 + }, + { + "epoch": 2.1445108764788197, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8339307308197021, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8532567620277405, + "num_tokens": 643096662.0, + "step": 16858 + }, + { + "epoch": 2.14463808675741, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.044823408126831, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8798373341560364, + "num_tokens": 643137923.0, + "step": 16859 + }, + { + "epoch": 2.1447652970360007, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9761840105056763, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8722120523452759, + "num_tokens": 643176918.0, + "step": 16860 + }, + { + "epoch": 2.144892507314591, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.036919355392456, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.884341299533844, + "num_tokens": 643217525.0, + "step": 16861 + }, + { + "epoch": 2.1450197175931813, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8206520080566406, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8830891847610474, + "num_tokens": 643255131.0, + "step": 16862 + }, + { + "epoch": 2.145146927871772, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9071071147918701, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8755593299865723, + "num_tokens": 643292092.0, + "step": 16863 + }, + { + "epoch": 2.1452741381503624, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8831912279129028, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8616297245025635, + "num_tokens": 643330510.0, + "step": 16864 + }, + { + "epoch": 2.145401348428953, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.7826588153839111, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8610049486160278, + "num_tokens": 643372900.0, + "step": 16865 + }, + { + "epoch": 2.1455285587075434, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.7540035247802734, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8739223480224609, + "num_tokens": 643412866.0, + "step": 16866 + }, + { + "epoch": 2.145655768986134, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8623782396316528, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8500756025314331, + "num_tokens": 643452846.0, + "step": 16867 + }, + { + "epoch": 2.1457829792647245, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9694435596466064, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8718676567077637, + "num_tokens": 643488264.0, + "step": 16868 + }, + { + "epoch": 2.145910189543315, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8298203945159912, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8540589213371277, + "num_tokens": 643528131.0, + "step": 16869 + }, + { + "epoch": 2.1460373998219056, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9173977375030518, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8656196594238281, + "num_tokens": 643564041.0, + "step": 16870 + }, + { + "epoch": 2.146164610100496, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9731619358062744, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8546383380889893, + "num_tokens": 643601507.0, + "step": 16871 + }, + { + "epoch": 2.1462918203790866, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.905738353729248, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8509683609008789, + "num_tokens": 643647161.0, + "step": 16872 + }, + { + "epoch": 2.146419030657677, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8657214641571045, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.871834933757782, + "num_tokens": 643687114.0, + "step": 16873 + }, + { + "epoch": 2.1465462409362677, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8592861890792847, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8518823981285095, + "num_tokens": 643722110.0, + "step": 16874 + }, + { + "epoch": 2.146673451214858, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9700725078582764, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8435921669006348, + "num_tokens": 643765770.0, + "step": 16875 + }, + { + "epoch": 2.1468006614934487, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8530292510986328, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.856784462928772, + "num_tokens": 643810097.0, + "step": 16876 + }, + { + "epoch": 2.1469278717720393, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.950853943824768, + "learning_rate": 1e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8393359184265137, + "num_tokens": 643847904.0, + "step": 16877 + }, + { + "epoch": 2.14705508205063, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8214190006256104, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8737130165100098, + "num_tokens": 643889157.0, + "step": 16878 + }, + { + "epoch": 2.1471822923292203, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7813314199447632, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8695724010467529, + "num_tokens": 643931181.0, + "step": 16879 + }, + { + "epoch": 2.147309502607811, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8906549215316772, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.865526556968689, + "num_tokens": 643965415.0, + "step": 16880 + }, + { + "epoch": 2.1474367128864014, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8775990009307861, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8509101867675781, + "num_tokens": 644010647.0, + "step": 16881 + }, + { + "epoch": 2.147563923164992, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9444257020950317, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.882945716381073, + "num_tokens": 644044347.0, + "step": 16882 + }, + { + "epoch": 2.1476911334435824, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 3.181032180786133, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.873407244682312, + "num_tokens": 644083456.0, + "step": 16883 + }, + { + "epoch": 2.147818343722173, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2079544067382812, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8560854196548462, + "num_tokens": 644123851.0, + "step": 16884 + }, + { + "epoch": 2.1479455540007635, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1916236877441406, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8763257265090942, + "num_tokens": 644168880.0, + "step": 16885 + }, + { + "epoch": 2.1480727642793536, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.015138864517212, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8672323822975159, + "num_tokens": 644204532.0, + "step": 16886 + }, + { + "epoch": 2.148199974557944, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7728796005249023, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8849020004272461, + "num_tokens": 644246160.0, + "step": 16887 + }, + { + "epoch": 2.1483271848365346, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0601084232330322, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8793715834617615, + "num_tokens": 644284881.0, + "step": 16888 + }, + { + "epoch": 2.148454395115125, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0496058464050293, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8601481318473816, + "num_tokens": 644326661.0, + "step": 16889 + }, + { + "epoch": 2.1485816053937157, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.840967059135437, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8716444969177246, + "num_tokens": 644364420.0, + "step": 16890 + }, + { + "epoch": 2.148708815672306, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0505263805389404, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8494187593460083, + "num_tokens": 644398626.0, + "step": 16891 + }, + { + "epoch": 2.1488360259508967, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.078465700149536, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8665059208869934, + "num_tokens": 644434772.0, + "step": 16892 + }, + { + "epoch": 2.1489632362294873, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.968984603881836, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8624352812767029, + "num_tokens": 644474425.0, + "step": 16893 + }, + { + "epoch": 2.149090446508078, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.1345131397247314, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8563879728317261, + "num_tokens": 644507796.0, + "step": 16894 + }, + { + "epoch": 2.1492176567866683, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.969728946685791, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8610105514526367, + "num_tokens": 644544172.0, + "step": 16895 + }, + { + "epoch": 2.149344867065259, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9449118375778198, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8648132085800171, + "num_tokens": 644582217.0, + "step": 16896 + }, + { + "epoch": 2.1494720773438494, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8444061279296875, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8727548718452454, + "num_tokens": 644620028.0, + "step": 16897 + }, + { + "epoch": 2.14959928762244, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.904610514640808, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.850175678730011, + "num_tokens": 644663743.0, + "step": 16898 + }, + { + "epoch": 2.1497264979010304, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8691471815109253, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8684711456298828, + "num_tokens": 644700914.0, + "step": 16899 + }, + { + "epoch": 2.149853708179621, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.904052734375, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8620209693908691, + "num_tokens": 644744232.0, + "step": 16900 + }, + { + "epoch": 2.1499809184582115, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9914677143096924, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8577265739440918, + "num_tokens": 644780851.0, + "step": 16901 + }, + { + "epoch": 2.150108128736802, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.949867606163025, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.874321699142456, + "num_tokens": 644822103.0, + "step": 16902 + }, + { + "epoch": 2.1502353390153925, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0032594203948975, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8644181489944458, + "num_tokens": 644861209.0, + "step": 16903 + }, + { + "epoch": 2.150362549293983, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.84206223487854, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8525937795639038, + "num_tokens": 644899860.0, + "step": 16904 + }, + { + "epoch": 2.1504897595725736, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8758572340011597, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8600122928619385, + "num_tokens": 644943101.0, + "step": 16905 + }, + { + "epoch": 2.150616969851164, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.011522054672241, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8546550273895264, + "num_tokens": 644981786.0, + "step": 16906 + }, + { + "epoch": 2.1507441801297547, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.890123724937439, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8682096004486084, + "num_tokens": 645017949.0, + "step": 16907 + }, + { + "epoch": 2.150871390408345, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8740715980529785, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8648075461387634, + "num_tokens": 645053443.0, + "step": 16908 + }, + { + "epoch": 2.1509986006869353, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.03859543800354, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8671057820320129, + "num_tokens": 645089916.0, + "step": 16909 + }, + { + "epoch": 2.1511258109655262, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8897674083709717, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.849105954170227, + "num_tokens": 645134531.0, + "step": 16910 + }, + { + "epoch": 2.1512530212441163, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9561320543289185, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8605839014053345, + "num_tokens": 645172608.0, + "step": 16911 + }, + { + "epoch": 2.151380231522707, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0086700916290283, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8622081875801086, + "num_tokens": 645209895.0, + "step": 16912 + }, + { + "epoch": 2.1515074418012974, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9218742847442627, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8724446296691895, + "num_tokens": 645245259.0, + "step": 16913 + }, + { + "epoch": 2.151634652079888, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.957155704498291, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8616264462471008, + "num_tokens": 645284962.0, + "step": 16914 + }, + { + "epoch": 2.1517618623584784, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8923941850662231, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8787568807601929, + "num_tokens": 645322352.0, + "step": 16915 + }, + { + "epoch": 2.151889072637069, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.065059185028076, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8645824790000916, + "num_tokens": 645358401.0, + "step": 16916 + }, + { + "epoch": 2.1520162829156595, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8831490278244019, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8784459829330444, + "num_tokens": 645399147.0, + "step": 16917 + }, + { + "epoch": 2.15214349319425, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9460362195968628, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8486325740814209, + "num_tokens": 645437286.0, + "step": 16918 + }, + { + "epoch": 2.1522707034728406, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8002759218215942, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8580414056777954, + "num_tokens": 645477853.0, + "step": 16919 + }, + { + "epoch": 2.152397913751431, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8385841846466064, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8562177419662476, + "num_tokens": 645520476.0, + "step": 16920 + }, + { + "epoch": 2.1525251240300216, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8727912902832031, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8496583104133606, + "num_tokens": 645561539.0, + "step": 16921 + }, + { + "epoch": 2.152652334308612, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.807604432106018, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8607449531555176, + "num_tokens": 645602950.0, + "step": 16922 + }, + { + "epoch": 2.1527795445872027, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.992977261543274, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8562081456184387, + "num_tokens": 645639215.0, + "step": 16923 + }, + { + "epoch": 2.152906754865793, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9317102432250977, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8642392158508301, + "num_tokens": 645677287.0, + "step": 16924 + }, + { + "epoch": 2.1530339651443837, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.075321912765503, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8727498054504395, + "num_tokens": 645710784.0, + "step": 16925 + }, + { + "epoch": 2.1531611754229742, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.920802116394043, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8612339496612549, + "num_tokens": 645750568.0, + "step": 16926 + }, + { + "epoch": 2.1532883857015648, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1160972118377686, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8686252236366272, + "num_tokens": 645783911.0, + "step": 16927 + }, + { + "epoch": 2.1534155959801553, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2361879348754883, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8810054063796997, + "num_tokens": 645822524.0, + "step": 16928 + }, + { + "epoch": 2.153542806258746, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2608087062835693, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8598323464393616, + "num_tokens": 645856924.0, + "step": 16929 + }, + { + "epoch": 2.1536700165373364, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9570666551589966, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8583352565765381, + "num_tokens": 645896995.0, + "step": 16930 + }, + { + "epoch": 2.153797226815927, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6783472299575806, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.87172931432724, + "num_tokens": 645941768.0, + "step": 16931 + }, + { + "epoch": 2.1539244370945174, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0203733444213867, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8631479740142822, + "num_tokens": 645981909.0, + "step": 16932 + }, + { + "epoch": 2.154051647373108, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8713960647583008, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8554540872573853, + "num_tokens": 646022034.0, + "step": 16933 + }, + { + "epoch": 2.154178857651698, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9427073001861572, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8641761541366577, + "num_tokens": 646056714.0, + "step": 16934 + }, + { + "epoch": 2.1543060679302886, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.75874662399292, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8563930988311768, + "num_tokens": 646095668.0, + "step": 16935 + }, + { + "epoch": 2.154433278208879, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1233103275299072, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8493466377258301, + "num_tokens": 646136745.0, + "step": 16936 + }, + { + "epoch": 2.1545604884874696, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.866512417793274, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8603999614715576, + "num_tokens": 646180314.0, + "step": 16937 + }, + { + "epoch": 2.15468769876606, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7909252643585205, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.86748868227005, + "num_tokens": 646218191.0, + "step": 16938 + }, + { + "epoch": 2.1548149090446507, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7779263257980347, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8808141946792603, + "num_tokens": 646256350.0, + "step": 16939 + }, + { + "epoch": 2.154942119323241, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.445770263671875, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8874707221984863, + "num_tokens": 646290915.0, + "step": 16940 + }, + { + "epoch": 2.1550693296018317, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9969514608383179, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8574145436286926, + "num_tokens": 646325838.0, + "step": 16941 + }, + { + "epoch": 2.1551965398804223, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9513523578643799, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8460084199905396, + "num_tokens": 646362742.0, + "step": 16942 + }, + { + "epoch": 2.155323750159013, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9767404794692993, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8522411584854126, + "num_tokens": 646403958.0, + "step": 16943 + }, + { + "epoch": 2.1554509604376033, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9487093687057495, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8561632633209229, + "num_tokens": 646442872.0, + "step": 16944 + }, + { + "epoch": 2.155578170716194, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9919086694717407, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8706543445587158, + "num_tokens": 646484642.0, + "step": 16945 + }, + { + "epoch": 2.1557053809947844, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8617395162582397, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.874090313911438, + "num_tokens": 646521688.0, + "step": 16946 + }, + { + "epoch": 2.155832591273375, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9331179857254028, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8721962571144104, + "num_tokens": 646560182.0, + "step": 16947 + }, + { + "epoch": 2.1559598015519654, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0149073600769043, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8695892095565796, + "num_tokens": 646593252.0, + "step": 16948 + }, + { + "epoch": 2.156087011830556, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9311456680297852, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8627015352249146, + "num_tokens": 646635514.0, + "step": 16949 + }, + { + "epoch": 2.1562142221091465, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9367449283599854, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8521180152893066, + "num_tokens": 646674785.0, + "step": 16950 + }, + { + "epoch": 2.156341432387737, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3407325744628906, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8646289110183716, + "num_tokens": 646713832.0, + "step": 16951 + }, + { + "epoch": 2.1564686426663275, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.88070547580719, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8658891320228577, + "num_tokens": 646752218.0, + "step": 16952 + }, + { + "epoch": 2.156595852944918, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9688069820404053, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8641458749771118, + "num_tokens": 646786196.0, + "step": 16953 + }, + { + "epoch": 2.1567230632235086, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0352423191070557, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8569977283477783, + "num_tokens": 646821462.0, + "step": 16954 + }, + { + "epoch": 2.156850273502099, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8582501411437988, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8714423775672913, + "num_tokens": 646856613.0, + "step": 16955 + }, + { + "epoch": 2.1569774837806897, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8811320066452026, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8766641616821289, + "num_tokens": 646893013.0, + "step": 16956 + }, + { + "epoch": 2.15710469405928, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.775858759880066, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8691431283950806, + "num_tokens": 646936180.0, + "step": 16957 + }, + { + "epoch": 2.1572319043378707, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8839772939682007, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8648352026939392, + "num_tokens": 646979314.0, + "step": 16958 + }, + { + "epoch": 2.157359114616461, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0172038078308105, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8757476210594177, + "num_tokens": 647020136.0, + "step": 16959 + }, + { + "epoch": 2.1574863248950513, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.314023017883301, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8689204454421997, + "num_tokens": 647052970.0, + "step": 16960 + }, + { + "epoch": 2.157613535173642, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9297926425933838, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.858970046043396, + "num_tokens": 647093195.0, + "step": 16961 + }, + { + "epoch": 2.1577407454522324, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9972363710403442, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8651996850967407, + "num_tokens": 647133800.0, + "step": 16962 + }, + { + "epoch": 2.157867955730823, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.107218027114868, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8499941825866699, + "num_tokens": 647167704.0, + "step": 16963 + }, + { + "epoch": 2.1579951660094134, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.876289963722229, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8516613841056824, + "num_tokens": 647208474.0, + "step": 16964 + }, + { + "epoch": 2.158122376288004, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8968464136123657, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8784341812133789, + "num_tokens": 647242131.0, + "step": 16965 + }, + { + "epoch": 2.1582495865665945, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9479234218597412, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8447009325027466, + "num_tokens": 647280952.0, + "step": 16966 + }, + { + "epoch": 2.158376796845185, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9664407968521118, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8586472272872925, + "num_tokens": 647318411.0, + "step": 16967 + }, + { + "epoch": 2.1585040071237755, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9121779203414917, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8523749113082886, + "num_tokens": 647361002.0, + "step": 16968 + }, + { + "epoch": 2.158631217402366, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7953249216079712, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8672204613685608, + "num_tokens": 647402995.0, + "step": 16969 + }, + { + "epoch": 2.1587584276809566, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8106502294540405, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.866592288017273, + "num_tokens": 647441220.0, + "step": 16970 + }, + { + "epoch": 2.158885637959547, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.919612169265747, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8585595488548279, + "num_tokens": 647476832.0, + "step": 16971 + }, + { + "epoch": 2.1590128482381377, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.870823621749878, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8673235177993774, + "num_tokens": 647516855.0, + "step": 16972 + }, + { + "epoch": 2.159140058516728, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8347457647323608, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.862179696559906, + "num_tokens": 647556913.0, + "step": 16973 + }, + { + "epoch": 2.1592672687953187, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.950206995010376, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8818457126617432, + "num_tokens": 647591242.0, + "step": 16974 + }, + { + "epoch": 2.1593944790739092, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7870882749557495, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8688739538192749, + "num_tokens": 647632156.0, + "step": 16975 + }, + { + "epoch": 2.1595216893524998, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.026275396347046, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8709452152252197, + "num_tokens": 647669857.0, + "step": 16976 + }, + { + "epoch": 2.1596488996310903, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.940061330795288, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8527771830558777, + "num_tokens": 647715580.0, + "step": 16977 + }, + { + "epoch": 2.159776109909681, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0059890747070312, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8549624085426331, + "num_tokens": 647755917.0, + "step": 16978 + }, + { + "epoch": 2.1599033201882714, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7568737268447876, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8805035352706909, + "num_tokens": 647802205.0, + "step": 16979 + }, + { + "epoch": 2.160030530466862, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7581743001937866, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8520519733428955, + "num_tokens": 647846627.0, + "step": 16980 + }, + { + "epoch": 2.1601577407454524, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8475158214569092, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8562546968460083, + "num_tokens": 647885033.0, + "step": 16981 + }, + { + "epoch": 2.160284951024043, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8200980424880981, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8552782535552979, + "num_tokens": 647923370.0, + "step": 16982 + }, + { + "epoch": 2.1604121613026335, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8997586965560913, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8711121678352356, + "num_tokens": 647963431.0, + "step": 16983 + }, + { + "epoch": 2.1605393715812236, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8171486854553223, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8569570779800415, + "num_tokens": 648002854.0, + "step": 16984 + }, + { + "epoch": 2.160666581859814, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9753726720809937, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8518291115760803, + "num_tokens": 648041082.0, + "step": 16985 + }, + { + "epoch": 2.1607937921384046, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1650609970092773, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.855457067489624, + "num_tokens": 648082109.0, + "step": 16986 + }, + { + "epoch": 2.160921002416995, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9795433282852173, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8794025182723999, + "num_tokens": 648117398.0, + "step": 16987 + }, + { + "epoch": 2.1610482126955857, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8276047706604004, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8724880218505859, + "num_tokens": 648158997.0, + "step": 16988 + }, + { + "epoch": 2.161175422974176, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9310071468353271, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8623054027557373, + "num_tokens": 648194334.0, + "step": 16989 + }, + { + "epoch": 2.1613026332527667, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.870287537574768, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8601415753364563, + "num_tokens": 648230601.0, + "step": 16990 + }, + { + "epoch": 2.1614298435313573, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8296539783477783, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8596522212028503, + "num_tokens": 648274979.0, + "step": 16991 + }, + { + "epoch": 2.161557053809948, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.922157883644104, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8487280011177063, + "num_tokens": 648319229.0, + "step": 16992 + }, + { + "epoch": 2.1616842640885383, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9203529357910156, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8778289556503296, + "num_tokens": 648354550.0, + "step": 16993 + }, + { + "epoch": 2.161811474367129, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8624619245529175, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8767886161804199, + "num_tokens": 648389820.0, + "step": 16994 + }, + { + "epoch": 2.1619386846457194, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9081792831420898, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8794004321098328, + "num_tokens": 648423841.0, + "step": 16995 + }, + { + "epoch": 2.16206589492431, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9881094694137573, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8519104719161987, + "num_tokens": 648459865.0, + "step": 16996 + }, + { + "epoch": 2.1621931052029004, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.856346845626831, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8823784589767456, + "num_tokens": 648492639.0, + "step": 16997 + }, + { + "epoch": 2.162320315481491, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8920955657958984, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8710296750068665, + "num_tokens": 648533456.0, + "step": 16998 + }, + { + "epoch": 2.1624475257600815, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8112820386886597, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8785163760185242, + "num_tokens": 648571955.0, + "step": 16999 + }, + { + "epoch": 2.162574736038672, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7346404790878296, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8808871507644653, + "num_tokens": 648614284.0, + "step": 17000 + }, + { + "epoch": 2.1627019463172625, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8362425565719604, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8741787672042847, + "num_tokens": 648654399.0, + "step": 17001 + }, + { + "epoch": 2.162829156595853, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.970689058303833, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8688492178916931, + "num_tokens": 648694490.0, + "step": 17002 + }, + { + "epoch": 2.1629563668744436, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.822334885597229, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8706598281860352, + "num_tokens": 648730665.0, + "step": 17003 + }, + { + "epoch": 2.163083577153034, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0360186100006104, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8685867786407471, + "num_tokens": 648765289.0, + "step": 17004 + }, + { + "epoch": 2.1632107874316246, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8897361755371094, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8666978478431702, + "num_tokens": 648803982.0, + "step": 17005 + }, + { + "epoch": 2.163337997710215, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0558154582977295, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8696995973587036, + "num_tokens": 648840992.0, + "step": 17006 + }, + { + "epoch": 2.1634652079888053, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0410268306732178, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.861710250377655, + "num_tokens": 648883621.0, + "step": 17007 + }, + { + "epoch": 2.1635924182673962, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8896675109863281, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8515738248825073, + "num_tokens": 648922258.0, + "step": 17008 + }, + { + "epoch": 2.1637196285459863, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8730287551879883, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8639541864395142, + "num_tokens": 648965010.0, + "step": 17009 + }, + { + "epoch": 2.163846838824577, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7634570598602295, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8605878353118896, + "num_tokens": 649008124.0, + "step": 17010 + }, + { + "epoch": 2.1639740491031674, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8702874183654785, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8648370504379272, + "num_tokens": 649044743.0, + "step": 17011 + }, + { + "epoch": 2.164101259381758, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8804606199264526, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8782873749732971, + "num_tokens": 649078410.0, + "step": 17012 + }, + { + "epoch": 2.1642284696603484, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.951186180114746, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8754339218139648, + "num_tokens": 649115415.0, + "step": 17013 + }, + { + "epoch": 2.164355679938939, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0473732948303223, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.862221360206604, + "num_tokens": 649156266.0, + "step": 17014 + }, + { + "epoch": 2.1644828902175295, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.062110662460327, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8721159100532532, + "num_tokens": 649192552.0, + "step": 17015 + }, + { + "epoch": 2.16461010049612, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1674046516418457, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8681412935256958, + "num_tokens": 649224163.0, + "step": 17016 + }, + { + "epoch": 2.1647373107747105, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2603485584259033, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8859105110168457, + "num_tokens": 649259348.0, + "step": 17017 + }, + { + "epoch": 2.164864521053301, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.179743766784668, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8798137903213501, + "num_tokens": 649296875.0, + "step": 17018 + }, + { + "epoch": 2.1649917313318916, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1454663276672363, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8513007164001465, + "num_tokens": 649333455.0, + "step": 17019 + }, + { + "epoch": 2.165118941610482, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0071299076080322, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8675951957702637, + "num_tokens": 649364567.0, + "step": 17020 + }, + { + "epoch": 2.1652461518890727, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.149643659591675, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8476877808570862, + "num_tokens": 649400579.0, + "step": 17021 + }, + { + "epoch": 2.165373362167663, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1773767471313477, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8774170875549316, + "num_tokens": 649432940.0, + "step": 17022 + }, + { + "epoch": 2.1655005724462537, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0445852279663086, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8706761002540588, + "num_tokens": 649466695.0, + "step": 17023 + }, + { + "epoch": 2.1656277827248442, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8573771715164185, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8629191517829895, + "num_tokens": 649504578.0, + "step": 17024 + }, + { + "epoch": 2.1657549930034348, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.069779872894287, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.85423344373703, + "num_tokens": 649541536.0, + "step": 17025 + }, + { + "epoch": 2.1658822032820253, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0588085651397705, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8640307188034058, + "num_tokens": 649581280.0, + "step": 17026 + }, + { + "epoch": 2.166009413560616, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9625624418258667, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8872407674789429, + "num_tokens": 649619837.0, + "step": 17027 + }, + { + "epoch": 2.1661366238392064, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9775394201278687, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8767592310905457, + "num_tokens": 649654708.0, + "step": 17028 + }, + { + "epoch": 2.166263834117797, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.10296368598938, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8445966243743896, + "num_tokens": 649691709.0, + "step": 17029 + }, + { + "epoch": 2.1663910443963874, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8276468515396118, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8611569404602051, + "num_tokens": 649734876.0, + "step": 17030 + }, + { + "epoch": 2.166518254674978, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.906517744064331, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8743678331375122, + "num_tokens": 649775477.0, + "step": 17031 + }, + { + "epoch": 2.166645464953568, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7064913511276245, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8773742914199829, + "num_tokens": 649816136.0, + "step": 17032 + }, + { + "epoch": 2.1667726752321586, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0660226345062256, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8735426664352417, + "num_tokens": 649847942.0, + "step": 17033 + }, + { + "epoch": 2.166899885510749, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9461132287979126, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8619632124900818, + "num_tokens": 649887875.0, + "step": 17034 + }, + { + "epoch": 2.1670270957893396, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.036543130874634, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8529199957847595, + "num_tokens": 649920557.0, + "step": 17035 + }, + { + "epoch": 2.16715430606793, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.954417109489441, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8745561838150024, + "num_tokens": 649959704.0, + "step": 17036 + }, + { + "epoch": 2.1672815163465207, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7939687967300415, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8714317679405212, + "num_tokens": 650000433.0, + "step": 17037 + }, + { + "epoch": 2.167408726625111, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7816742658615112, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.877056360244751, + "num_tokens": 650038793.0, + "step": 17038 + }, + { + "epoch": 2.1675359369037017, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9454578161239624, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8755505681037903, + "num_tokens": 650072877.0, + "step": 17039 + }, + { + "epoch": 2.1676631471822922, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.926870346069336, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8540170192718506, + "num_tokens": 650109245.0, + "step": 17040 + }, + { + "epoch": 2.1677903574608828, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.89901864528656, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8797869682312012, + "num_tokens": 650150254.0, + "step": 17041 + }, + { + "epoch": 2.1679175677394733, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9410895109176636, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8452840447425842, + "num_tokens": 650189587.0, + "step": 17042 + }, + { + "epoch": 2.168044778018064, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0465517044067383, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8626253604888916, + "num_tokens": 650225443.0, + "step": 17043 + }, + { + "epoch": 2.1681719882966544, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9194332361221313, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.861518144607544, + "num_tokens": 650262319.0, + "step": 17044 + }, + { + "epoch": 2.168299198575245, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9037408828735352, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8482657670974731, + "num_tokens": 650297137.0, + "step": 17045 + }, + { + "epoch": 2.1684264088538354, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.976943016052246, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8634794354438782, + "num_tokens": 650335521.0, + "step": 17046 + }, + { + "epoch": 2.168553619132426, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9441789388656616, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8599718809127808, + "num_tokens": 650373050.0, + "step": 17047 + }, + { + "epoch": 2.1686808294110165, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8498222827911377, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8721466064453125, + "num_tokens": 650407438.0, + "step": 17048 + }, + { + "epoch": 2.168808039689607, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.840482234954834, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.861011266708374, + "num_tokens": 650447793.0, + "step": 17049 + }, + { + "epoch": 2.1689352499681975, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9713106155395508, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8614478707313538, + "num_tokens": 650487167.0, + "step": 17050 + }, + { + "epoch": 2.169062460246788, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1767783164978027, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8522275686264038, + "num_tokens": 650522960.0, + "step": 17051 + }, + { + "epoch": 2.1691896705253786, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0040183067321777, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8586618900299072, + "num_tokens": 650564161.0, + "step": 17052 + }, + { + "epoch": 2.169316880803969, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.031789779663086, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8720439672470093, + "num_tokens": 650599201.0, + "step": 17053 + }, + { + "epoch": 2.1694440910825596, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9999727010726929, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8467069268226624, + "num_tokens": 650637385.0, + "step": 17054 + }, + { + "epoch": 2.16957130136115, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0649921894073486, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8659915924072266, + "num_tokens": 650671954.0, + "step": 17055 + }, + { + "epoch": 2.1696985116397407, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8601223230361938, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8818949460983276, + "num_tokens": 650708475.0, + "step": 17056 + }, + { + "epoch": 2.169825721918331, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.844508171081543, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8682314157485962, + "num_tokens": 650748089.0, + "step": 17057 + }, + { + "epoch": 2.1699529321969213, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7377853393554688, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8677159547805786, + "num_tokens": 650790869.0, + "step": 17058 + }, + { + "epoch": 2.170080142475512, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8896182775497437, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8512804508209229, + "num_tokens": 650830697.0, + "step": 17059 + }, + { + "epoch": 2.1702073527541024, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8713366985321045, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8746756315231323, + "num_tokens": 650868755.0, + "step": 17060 + }, + { + "epoch": 2.170334563032693, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9781912565231323, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8762121200561523, + "num_tokens": 650908019.0, + "step": 17061 + }, + { + "epoch": 2.1704617733112834, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7696655988693237, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8640868663787842, + "num_tokens": 650954195.0, + "step": 17062 + }, + { + "epoch": 2.170588983589874, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.752214789390564, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8804814219474792, + "num_tokens": 650992740.0, + "step": 17063 + }, + { + "epoch": 2.1707161938684645, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9366284608840942, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8790995478630066, + "num_tokens": 651031524.0, + "step": 17064 + }, + { + "epoch": 2.170843404147055, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9995920658111572, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.865236222743988, + "num_tokens": 651070004.0, + "step": 17065 + }, + { + "epoch": 2.1709706144256455, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.4210801124572754, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8518718481063843, + "num_tokens": 651104474.0, + "step": 17066 + }, + { + "epoch": 2.171097824704236, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8250428438186646, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8578875064849854, + "num_tokens": 651148913.0, + "step": 17067 + }, + { + "epoch": 2.1712250349828266, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8800668716430664, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8730175495147705, + "num_tokens": 651192072.0, + "step": 17068 + }, + { + "epoch": 2.171352245261417, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0877904891967773, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8577784895896912, + "num_tokens": 651226132.0, + "step": 17069 + }, + { + "epoch": 2.1714794555400077, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.957323670387268, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8678678870201111, + "num_tokens": 651263467.0, + "step": 17070 + }, + { + "epoch": 2.171606665818598, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9682435989379883, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8560178279876709, + "num_tokens": 651309521.0, + "step": 17071 + }, + { + "epoch": 2.1717338760971887, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9130834341049194, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8680118322372437, + "num_tokens": 651347026.0, + "step": 17072 + }, + { + "epoch": 2.1718610863757792, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.97983980178833, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8736620545387268, + "num_tokens": 651380545.0, + "step": 17073 + }, + { + "epoch": 2.1719882966543698, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1723580360412598, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8711239099502563, + "num_tokens": 651409428.0, + "step": 17074 + }, + { + "epoch": 2.1721155069329603, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9192966222763062, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8520558476448059, + "num_tokens": 651447993.0, + "step": 17075 + }, + { + "epoch": 2.172242717211551, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.953621506690979, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8599746227264404, + "num_tokens": 651488210.0, + "step": 17076 + }, + { + "epoch": 2.1723699274901414, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9701138734817505, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8619089722633362, + "num_tokens": 651525742.0, + "step": 17077 + }, + { + "epoch": 2.172497137768732, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.917201042175293, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8713924884796143, + "num_tokens": 651565569.0, + "step": 17078 + }, + { + "epoch": 2.1726243480473224, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9399070739746094, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.86076420545578, + "num_tokens": 651605291.0, + "step": 17079 + }, + { + "epoch": 2.172751558325913, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8640186786651611, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8668159246444702, + "num_tokens": 651641998.0, + "step": 17080 + }, + { + "epoch": 2.1728787686045035, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.944464087486267, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8635280132293701, + "num_tokens": 651683040.0, + "step": 17081 + }, + { + "epoch": 2.1730059788830935, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.952302098274231, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8742567300796509, + "num_tokens": 651718067.0, + "step": 17082 + }, + { + "epoch": 2.173133189161684, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7768906354904175, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.881352424621582, + "num_tokens": 651761995.0, + "step": 17083 + }, + { + "epoch": 2.1732603994402746, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9179487228393555, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8805937767028809, + "num_tokens": 651797780.0, + "step": 17084 + }, + { + "epoch": 2.173387609718865, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9189902544021606, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8537049293518066, + "num_tokens": 651832931.0, + "step": 17085 + }, + { + "epoch": 2.1735148199974557, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0289220809936523, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8598344922065735, + "num_tokens": 651868440.0, + "step": 17086 + }, + { + "epoch": 2.173642030276046, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.101499319076538, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8678486943244934, + "num_tokens": 651900029.0, + "step": 17087 + }, + { + "epoch": 2.1737692405546367, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.997119665145874, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8612915277481079, + "num_tokens": 651944892.0, + "step": 17088 + }, + { + "epoch": 2.1738964508332272, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2135727405548096, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8725471496582031, + "num_tokens": 651979330.0, + "step": 17089 + }, + { + "epoch": 2.1740236611118178, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.971866488456726, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8569978475570679, + "num_tokens": 652017389.0, + "step": 17090 + }, + { + "epoch": 2.1741508713904083, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1099088191986084, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8618072271347046, + "num_tokens": 652052260.0, + "step": 17091 + }, + { + "epoch": 2.174278081668999, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.153862714767456, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.850799560546875, + "num_tokens": 652089917.0, + "step": 17092 + }, + { + "epoch": 2.1744052919475894, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.777018666267395, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8691992163658142, + "num_tokens": 652129849.0, + "step": 17093 + }, + { + "epoch": 2.17453250222618, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0664145946502686, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8564554452896118, + "num_tokens": 652163178.0, + "step": 17094 + }, + { + "epoch": 2.1746597125047704, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9431484937667847, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8717783689498901, + "num_tokens": 652201521.0, + "step": 17095 + }, + { + "epoch": 2.174786922783361, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9237141609191895, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8665459752082825, + "num_tokens": 652234820.0, + "step": 17096 + }, + { + "epoch": 2.1749141330619515, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9381327629089355, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8620777726173401, + "num_tokens": 652275686.0, + "step": 17097 + }, + { + "epoch": 2.175041343340542, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.078793525695801, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8564736843109131, + "num_tokens": 652313243.0, + "step": 17098 + }, + { + "epoch": 2.1751685536191325, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8195568323135376, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8810276985168457, + "num_tokens": 652351550.0, + "step": 17099 + }, + { + "epoch": 2.175295763897723, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.870949387550354, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8626277446746826, + "num_tokens": 652399834.0, + "step": 17100 + }, + { + "epoch": 2.1754229741763136, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0114104747772217, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8517812490463257, + "num_tokens": 652435839.0, + "step": 17101 + }, + { + "epoch": 2.175550184454904, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0076966285705566, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8619645833969116, + "num_tokens": 652475747.0, + "step": 17102 + }, + { + "epoch": 2.1756773947334946, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8876900672912598, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8769997954368591, + "num_tokens": 652517147.0, + "step": 17103 + }, + { + "epoch": 2.175804605012085, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8486801385879517, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8773285150527954, + "num_tokens": 652552811.0, + "step": 17104 + }, + { + "epoch": 2.1759318152906753, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9753657579421997, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8724629878997803, + "num_tokens": 652587709.0, + "step": 17105 + }, + { + "epoch": 2.1760590255692662, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0589306354522705, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8642553091049194, + "num_tokens": 652626213.0, + "step": 17106 + }, + { + "epoch": 2.1761862358478563, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0438177585601807, + "learning_rate": 1e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.8375624418258667, + "num_tokens": 652665692.0, + "step": 17107 + }, + { + "epoch": 2.176313446126447, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9942831993103027, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8569397926330566, + "num_tokens": 652705515.0, + "step": 17108 + }, + { + "epoch": 2.1764406564050374, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8808120489120483, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8767541646957397, + "num_tokens": 652743343.0, + "step": 17109 + }, + { + "epoch": 2.176567866683628, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9506756067276, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8594979047775269, + "num_tokens": 652784587.0, + "step": 17110 + }, + { + "epoch": 2.1766950769622184, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8601157665252686, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8571515083312988, + "num_tokens": 652821985.0, + "step": 17111 + }, + { + "epoch": 2.176822287240809, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9876277446746826, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8762555718421936, + "num_tokens": 652855772.0, + "step": 17112 + }, + { + "epoch": 2.1769494975193995, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.958384394645691, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8608070611953735, + "num_tokens": 652890497.0, + "step": 17113 + }, + { + "epoch": 2.17707670779799, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9951074123382568, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8761259913444519, + "num_tokens": 652931611.0, + "step": 17114 + }, + { + "epoch": 2.1772039180765805, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8660989999771118, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8653866052627563, + "num_tokens": 652972129.0, + "step": 17115 + }, + { + "epoch": 2.177331128355171, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.984379768371582, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8584749102592468, + "num_tokens": 653009848.0, + "step": 17116 + }, + { + "epoch": 2.1774583386337616, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.085507869720459, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.862364649772644, + "num_tokens": 653044614.0, + "step": 17117 + }, + { + "epoch": 2.177585548912352, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7470303773880005, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8800405859947205, + "num_tokens": 653087614.0, + "step": 17118 + }, + { + "epoch": 2.1777127591909426, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.003936529159546, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.87815260887146, + "num_tokens": 653120434.0, + "step": 17119 + }, + { + "epoch": 2.177839969469533, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.880852222442627, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.874035120010376, + "num_tokens": 653159390.0, + "step": 17120 + }, + { + "epoch": 2.1779671797481237, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9010854959487915, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8582209348678589, + "num_tokens": 653201045.0, + "step": 17121 + }, + { + "epoch": 2.1780943900267142, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.776407241821289, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8681269288063049, + "num_tokens": 653244968.0, + "step": 17122 + }, + { + "epoch": 2.1782216003053048, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.873024344444275, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8591976165771484, + "num_tokens": 653281157.0, + "step": 17123 + }, + { + "epoch": 2.1783488105838953, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9240864515304565, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8779019117355347, + "num_tokens": 653321055.0, + "step": 17124 + }, + { + "epoch": 2.178476020862486, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0793652534484863, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8531345129013062, + "num_tokens": 653359320.0, + "step": 17125 + }, + { + "epoch": 2.1786032311410763, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8020998239517212, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8856992125511169, + "num_tokens": 653404186.0, + "step": 17126 + }, + { + "epoch": 2.178730441419667, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9051117897033691, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.87493896484375, + "num_tokens": 653438122.0, + "step": 17127 + }, + { + "epoch": 2.1788576516982574, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.075007915496826, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8642661571502686, + "num_tokens": 653474727.0, + "step": 17128 + }, + { + "epoch": 2.178984861976848, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0357680320739746, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8710421919822693, + "num_tokens": 653513531.0, + "step": 17129 + }, + { + "epoch": 2.179112072255438, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.719983696937561, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8783345222473145, + "num_tokens": 653555871.0, + "step": 17130 + }, + { + "epoch": 2.1792392825340285, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.845579743385315, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8676710724830627, + "num_tokens": 653598825.0, + "step": 17131 + }, + { + "epoch": 2.179366492812619, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7766844034194946, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8754487037658691, + "num_tokens": 653639838.0, + "step": 17132 + }, + { + "epoch": 2.1794937030912096, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.906718134880066, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8573756217956543, + "num_tokens": 653678958.0, + "step": 17133 + }, + { + "epoch": 2.1796209133698, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9318432807922363, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8748412132263184, + "num_tokens": 653718043.0, + "step": 17134 + }, + { + "epoch": 2.1797481236483907, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8372204303741455, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8805351853370667, + "num_tokens": 653756308.0, + "step": 17135 + }, + { + "epoch": 2.179875333926981, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.004617929458618, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8733391761779785, + "num_tokens": 653793140.0, + "step": 17136 + }, + { + "epoch": 2.1800025442055717, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2332308292388916, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8709433078765869, + "num_tokens": 653823441.0, + "step": 17137 + }, + { + "epoch": 2.1801297544841622, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.035897731781006, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8571389317512512, + "num_tokens": 653860743.0, + "step": 17138 + }, + { + "epoch": 2.1802569647627528, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1937994956970215, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8774794936180115, + "num_tokens": 653894785.0, + "step": 17139 + }, + { + "epoch": 2.1803841750413433, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.944193959236145, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8628652691841125, + "num_tokens": 653932208.0, + "step": 17140 + }, + { + "epoch": 2.180511385319934, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.066300630569458, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8522392511367798, + "num_tokens": 653968818.0, + "step": 17141 + }, + { + "epoch": 2.1806385955985244, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8486006259918213, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8635725975036621, + "num_tokens": 654010696.0, + "step": 17142 + }, + { + "epoch": 2.180765805877115, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9221068620681763, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8694502711296082, + "num_tokens": 654047632.0, + "step": 17143 + }, + { + "epoch": 2.1808930161557054, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9056295156478882, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8694968223571777, + "num_tokens": 654088200.0, + "step": 17144 + }, + { + "epoch": 2.181020226434296, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9583301544189453, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8622837066650391, + "num_tokens": 654124225.0, + "step": 17145 + }, + { + "epoch": 2.1811474367128865, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8306785821914673, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8684260845184326, + "num_tokens": 654163317.0, + "step": 17146 + }, + { + "epoch": 2.181274646991477, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9140632152557373, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8597084879875183, + "num_tokens": 654199208.0, + "step": 17147 + }, + { + "epoch": 2.1814018572700675, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0347421169281006, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8546415567398071, + "num_tokens": 654239988.0, + "step": 17148 + }, + { + "epoch": 2.181529067548658, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0671231746673584, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8589212894439697, + "num_tokens": 654276361.0, + "step": 17149 + }, + { + "epoch": 2.1816562778272486, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8135524988174438, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8685636520385742, + "num_tokens": 654316690.0, + "step": 17150 + }, + { + "epoch": 2.181783488105839, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.749654769897461, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8763226270675659, + "num_tokens": 654354112.0, + "step": 17151 + }, + { + "epoch": 2.1819106983844296, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7768449783325195, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8778942823410034, + "num_tokens": 654398896.0, + "step": 17152 + }, + { + "epoch": 2.18203790866302, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0175933837890625, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8793851733207703, + "num_tokens": 654435180.0, + "step": 17153 + }, + { + "epoch": 2.1821651189416107, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0623717308044434, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8536222577095032, + "num_tokens": 654473958.0, + "step": 17154 + }, + { + "epoch": 2.1822923292202008, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.041072368621826, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8520163297653198, + "num_tokens": 654512918.0, + "step": 17155 + }, + { + "epoch": 2.1824195394987913, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9957767724990845, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8725961446762085, + "num_tokens": 654552296.0, + "step": 17156 + }, + { + "epoch": 2.182546749777382, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9268524646759033, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8723426461219788, + "num_tokens": 654592088.0, + "step": 17157 + }, + { + "epoch": 2.1826739600559724, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.825554609298706, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.858768105506897, + "num_tokens": 654635119.0, + "step": 17158 + }, + { + "epoch": 2.182801170334563, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9322150945663452, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8534141778945923, + "num_tokens": 654671204.0, + "step": 17159 + }, + { + "epoch": 2.1829283806131534, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.20518159866333, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8643158674240112, + "num_tokens": 654702986.0, + "step": 17160 + }, + { + "epoch": 2.183055590891744, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9642690420150757, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8419805765151978, + "num_tokens": 654745813.0, + "step": 17161 + }, + { + "epoch": 2.1831828011703345, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8834058046340942, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8651560544967651, + "num_tokens": 654783305.0, + "step": 17162 + }, + { + "epoch": 2.183310011448925, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.074697256088257, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8737524747848511, + "num_tokens": 654823427.0, + "step": 17163 + }, + { + "epoch": 2.1834372217275155, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.852820634841919, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8794285655021667, + "num_tokens": 654862400.0, + "step": 17164 + }, + { + "epoch": 2.183564432006106, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8522740602493286, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8696503639221191, + "num_tokens": 654902484.0, + "step": 17165 + }, + { + "epoch": 2.1836916422846966, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8140878677368164, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.872312605381012, + "num_tokens": 654946882.0, + "step": 17166 + }, + { + "epoch": 2.183818852563287, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.028930902481079, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8568983674049377, + "num_tokens": 654977850.0, + "step": 17167 + }, + { + "epoch": 2.1839460628418776, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0735132694244385, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8512088656425476, + "num_tokens": 655015352.0, + "step": 17168 + }, + { + "epoch": 2.184073273120468, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0786209106445312, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8669381141662598, + "num_tokens": 655047488.0, + "step": 17169 + }, + { + "epoch": 2.1842004833990587, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9174410104751587, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8632787466049194, + "num_tokens": 655083410.0, + "step": 17170 + }, + { + "epoch": 2.1843276936776492, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9273905754089355, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.875352680683136, + "num_tokens": 655120594.0, + "step": 17171 + }, + { + "epoch": 2.1844549039562398, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.004570245742798, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.864219605922699, + "num_tokens": 655157310.0, + "step": 17172 + }, + { + "epoch": 2.1845821142348303, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9012831449508667, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8746687173843384, + "num_tokens": 655193749.0, + "step": 17173 + }, + { + "epoch": 2.184709324513421, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9375290870666504, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8797273635864258, + "num_tokens": 655235532.0, + "step": 17174 + }, + { + "epoch": 2.1848365347920113, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7676796913146973, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8727326989173889, + "num_tokens": 655282154.0, + "step": 17175 + }, + { + "epoch": 2.184963745070602, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9296983480453491, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.859218955039978, + "num_tokens": 655321342.0, + "step": 17176 + }, + { + "epoch": 2.1850909553491924, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8493788242340088, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8832676410675049, + "num_tokens": 655358107.0, + "step": 17177 + }, + { + "epoch": 2.185218165627783, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8955674171447754, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8595976829528809, + "num_tokens": 655393462.0, + "step": 17178 + }, + { + "epoch": 2.1853453759063735, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8875377178192139, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8669579029083252, + "num_tokens": 655436114.0, + "step": 17179 + }, + { + "epoch": 2.1854725861849635, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.14709734916687, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8682988882064819, + "num_tokens": 655473119.0, + "step": 17180 + }, + { + "epoch": 2.185599796463554, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1387810707092285, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.861387312412262, + "num_tokens": 655515537.0, + "step": 17181 + }, + { + "epoch": 2.1857270067421446, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9343619346618652, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8595218658447266, + "num_tokens": 655555944.0, + "step": 17182 + }, + { + "epoch": 2.185854217020735, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1320607662200928, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8493455648422241, + "num_tokens": 655589952.0, + "step": 17183 + }, + { + "epoch": 2.1859814272993257, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8170922994613647, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8777008056640625, + "num_tokens": 655628269.0, + "step": 17184 + }, + { + "epoch": 2.186108637577916, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1693227291107178, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8695696592330933, + "num_tokens": 655657315.0, + "step": 17185 + }, + { + "epoch": 2.1862358478565067, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8710228204727173, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8663438558578491, + "num_tokens": 655696117.0, + "step": 17186 + }, + { + "epoch": 2.1863630581350972, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0046300888061523, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8482437133789062, + "num_tokens": 655738823.0, + "step": 17187 + }, + { + "epoch": 2.1864902684136878, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7514995336532593, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8561710119247437, + "num_tokens": 655781457.0, + "step": 17188 + }, + { + "epoch": 2.1866174786922783, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.71479332447052, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8611106872558594, + "num_tokens": 655824704.0, + "step": 17189 + }, + { + "epoch": 2.186744688970869, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.89882230758667, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8764549493789673, + "num_tokens": 655860013.0, + "step": 17190 + }, + { + "epoch": 2.1868718992494594, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8195542097091675, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8631633520126343, + "num_tokens": 655897146.0, + "step": 17191 + }, + { + "epoch": 2.18699910952805, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9126814603805542, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8710727095603943, + "num_tokens": 655934299.0, + "step": 17192 + }, + { + "epoch": 2.1871263198066404, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6871954202651978, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8714035749435425, + "num_tokens": 655976554.0, + "step": 17193 + }, + { + "epoch": 2.187253530085231, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0236525535583496, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8784275650978088, + "num_tokens": 656011756.0, + "step": 17194 + }, + { + "epoch": 2.1873807403638215, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.875147819519043, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8629459142684937, + "num_tokens": 656054405.0, + "step": 17195 + }, + { + "epoch": 2.187507950642412, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.016465902328491, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8702714443206787, + "num_tokens": 656094668.0, + "step": 17196 + }, + { + "epoch": 2.1876351609210025, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.526381254196167, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8645024299621582, + "num_tokens": 656137664.0, + "step": 17197 + }, + { + "epoch": 2.187762371199593, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9066622257232666, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8585643768310547, + "num_tokens": 656180611.0, + "step": 17198 + }, + { + "epoch": 2.1878895814781836, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9192349910736084, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8571946620941162, + "num_tokens": 656217789.0, + "step": 17199 + }, + { + "epoch": 2.188016791756774, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7991105318069458, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8632160425186157, + "num_tokens": 656261940.0, + "step": 17200 + }, + { + "epoch": 2.1881440020353646, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9692879915237427, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8760639429092407, + "num_tokens": 656296000.0, + "step": 17201 + }, + { + "epoch": 2.188271212313955, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7961046695709229, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8760010004043579, + "num_tokens": 656340862.0, + "step": 17202 + }, + { + "epoch": 2.1883984225925452, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8047385215759277, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8919658660888672, + "num_tokens": 656376923.0, + "step": 17203 + }, + { + "epoch": 2.188525632871136, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.6494414806365967, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8628151416778564, + "num_tokens": 656414058.0, + "step": 17204 + }, + { + "epoch": 2.1886528431497263, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.049560785293579, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8565449714660645, + "num_tokens": 656450077.0, + "step": 17205 + }, + { + "epoch": 2.188780053428317, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0420563220977783, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8570448160171509, + "num_tokens": 656487258.0, + "step": 17206 + }, + { + "epoch": 2.1889072637069074, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8934154510498047, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8797930479049683, + "num_tokens": 656525141.0, + "step": 17207 + }, + { + "epoch": 2.189034473985498, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8585751056671143, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8701642751693726, + "num_tokens": 656561517.0, + "step": 17208 + }, + { + "epoch": 2.1891616842640884, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8962568044662476, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8717517256736755, + "num_tokens": 656595859.0, + "step": 17209 + }, + { + "epoch": 2.189288894542679, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1511471271514893, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8618938326835632, + "num_tokens": 656627652.0, + "step": 17210 + }, + { + "epoch": 2.1894161048212695, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.797688603401184, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8725030422210693, + "num_tokens": 656669951.0, + "step": 17211 + }, + { + "epoch": 2.18954331509986, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.126680612564087, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8783855438232422, + "num_tokens": 656702216.0, + "step": 17212 + }, + { + "epoch": 2.1896705253784505, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8028132915496826, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8561662435531616, + "num_tokens": 656744816.0, + "step": 17213 + }, + { + "epoch": 2.189797735657041, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8330957889556885, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.861674427986145, + "num_tokens": 656785533.0, + "step": 17214 + }, + { + "epoch": 2.1899249459356316, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0167274475097656, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8645867705345154, + "num_tokens": 656825375.0, + "step": 17215 + }, + { + "epoch": 2.190052156214222, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9480873346328735, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8745627403259277, + "num_tokens": 656857951.0, + "step": 17216 + }, + { + "epoch": 2.1901793664928126, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7905935049057007, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8756052255630493, + "num_tokens": 656896911.0, + "step": 17217 + }, + { + "epoch": 2.190306576771403, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 3.027796745300293, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8726782202720642, + "num_tokens": 656935723.0, + "step": 17218 + }, + { + "epoch": 2.1904337870499937, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9077562093734741, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8781282901763916, + "num_tokens": 656970881.0, + "step": 17219 + }, + { + "epoch": 2.1905609973285842, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9281526803970337, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8606169819831848, + "num_tokens": 657007721.0, + "step": 17220 + }, + { + "epoch": 2.1906882076071748, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9220515489578247, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8770506381988525, + "num_tokens": 657042359.0, + "step": 17221 + }, + { + "epoch": 2.1908154178857653, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.778687596321106, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8726097345352173, + "num_tokens": 657084662.0, + "step": 17222 + }, + { + "epoch": 2.190942628164356, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8961725234985352, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8673399090766907, + "num_tokens": 657122002.0, + "step": 17223 + }, + { + "epoch": 2.1910698384429463, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.967197299003601, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8675339221954346, + "num_tokens": 657158768.0, + "step": 17224 + }, + { + "epoch": 2.191197048721537, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.042229175567627, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8558042049407959, + "num_tokens": 657193445.0, + "step": 17225 + }, + { + "epoch": 2.1913242590001274, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8097100257873535, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.865180492401123, + "num_tokens": 657234922.0, + "step": 17226 + }, + { + "epoch": 2.191451469278718, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.054518699645996, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8570706248283386, + "num_tokens": 657270774.0, + "step": 17227 + }, + { + "epoch": 2.191578679557308, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8963040113449097, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8806980848312378, + "num_tokens": 657307046.0, + "step": 17228 + }, + { + "epoch": 2.1917058898358985, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7692856788635254, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8556511402130127, + "num_tokens": 657352865.0, + "step": 17229 + }, + { + "epoch": 2.191833100114489, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8730254173278809, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8468284606933594, + "num_tokens": 657393372.0, + "step": 17230 + }, + { + "epoch": 2.1919603103930796, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.809236764907837, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8457421064376831, + "num_tokens": 657432445.0, + "step": 17231 + }, + { + "epoch": 2.19208752067167, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.921472191810608, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8658841252326965, + "num_tokens": 657469145.0, + "step": 17232 + }, + { + "epoch": 2.1922147309502606, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7999433279037476, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8802449703216553, + "num_tokens": 657510369.0, + "step": 17233 + }, + { + "epoch": 2.192341941228851, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8772010803222656, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8714996576309204, + "num_tokens": 657550257.0, + "step": 17234 + }, + { + "epoch": 2.1924691515074417, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9767274856567383, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.869500994682312, + "num_tokens": 657587095.0, + "step": 17235 + }, + { + "epoch": 2.1925963617860322, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.141143560409546, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8708170056343079, + "num_tokens": 657619710.0, + "step": 17236 + }, + { + "epoch": 2.1927235720646228, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.013615608215332, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.862085223197937, + "num_tokens": 657653316.0, + "step": 17237 + }, + { + "epoch": 2.1928507823432133, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.846165418624878, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8613909482955933, + "num_tokens": 657693570.0, + "step": 17238 + }, + { + "epoch": 2.192977992621804, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7448209524154663, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8680671453475952, + "num_tokens": 657736781.0, + "step": 17239 + }, + { + "epoch": 2.1931052029003943, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8435702323913574, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8581287860870361, + "num_tokens": 657775603.0, + "step": 17240 + }, + { + "epoch": 2.193232413178985, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.015331745147705, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8644520044326782, + "num_tokens": 657809065.0, + "step": 17241 + }, + { + "epoch": 2.1933596234575754, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8792288303375244, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.864448070526123, + "num_tokens": 657851369.0, + "step": 17242 + }, + { + "epoch": 2.193486833736166, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7857569456100464, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8893487453460693, + "num_tokens": 657889008.0, + "step": 17243 + }, + { + "epoch": 2.1936140440147565, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 7.77241849899292, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.865312397480011, + "num_tokens": 657925867.0, + "step": 17244 + }, + { + "epoch": 2.193741254293347, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1074225902557373, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.855660080909729, + "num_tokens": 657959502.0, + "step": 17245 + }, + { + "epoch": 2.1938684645719375, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9190309047698975, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8683955073356628, + "num_tokens": 657999063.0, + "step": 17246 + }, + { + "epoch": 2.193995674850528, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9264594316482544, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8545942306518555, + "num_tokens": 658033016.0, + "step": 17247 + }, + { + "epoch": 2.1941228851291186, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.903862714767456, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8726301789283752, + "num_tokens": 658071259.0, + "step": 17248 + }, + { + "epoch": 2.194250095407709, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9228180646896362, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8675398826599121, + "num_tokens": 658103190.0, + "step": 17249 + }, + { + "epoch": 2.1943773056862996, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7605761289596558, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8581809401512146, + "num_tokens": 658143346.0, + "step": 17250 + }, + { + "epoch": 2.19450451596489, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8334299325942993, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.875984787940979, + "num_tokens": 658181747.0, + "step": 17251 + }, + { + "epoch": 2.1946317262434807, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8207446336746216, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8667525053024292, + "num_tokens": 658221770.0, + "step": 17252 + }, + { + "epoch": 2.1947589365220708, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9835894107818604, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8714718818664551, + "num_tokens": 658260157.0, + "step": 17253 + }, + { + "epoch": 2.1948861468006613, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 7.7602763175964355, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.855791449546814, + "num_tokens": 658301815.0, + "step": 17254 + }, + { + "epoch": 2.195013357079252, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.357992172241211, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8688352704048157, + "num_tokens": 658340732.0, + "step": 17255 + }, + { + "epoch": 2.1951405673578424, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0255818367004395, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8590645790100098, + "num_tokens": 658380569.0, + "step": 17256 + }, + { + "epoch": 2.195267777636433, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8908394575119019, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8793210387229919, + "num_tokens": 658415842.0, + "step": 17257 + }, + { + "epoch": 2.1953949879150234, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8684699535369873, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8709366917610168, + "num_tokens": 658452861.0, + "step": 17258 + }, + { + "epoch": 2.195522198193614, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9424231052398682, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8688929677009583, + "num_tokens": 658487019.0, + "step": 17259 + }, + { + "epoch": 2.1956494084722045, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7943611145019531, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8662190437316895, + "num_tokens": 658529290.0, + "step": 17260 + }, + { + "epoch": 2.195776618750795, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8206466436386108, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8605166077613831, + "num_tokens": 658572118.0, + "step": 17261 + }, + { + "epoch": 2.1959038290293855, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8708736896514893, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8632117509841919, + "num_tokens": 658615972.0, + "step": 17262 + }, + { + "epoch": 2.196031039307976, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9416520595550537, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8737860918045044, + "num_tokens": 658653848.0, + "step": 17263 + }, + { + "epoch": 2.1961582495865666, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9071669578552246, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8682972192764282, + "num_tokens": 658686247.0, + "step": 17264 + }, + { + "epoch": 2.196285459865157, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.047185182571411, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8614667654037476, + "num_tokens": 658725787.0, + "step": 17265 + }, + { + "epoch": 2.1964126701437476, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.077324628829956, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8616346120834351, + "num_tokens": 658765957.0, + "step": 17266 + }, + { + "epoch": 2.196539880422338, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.091897964477539, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8765972852706909, + "num_tokens": 658800996.0, + "step": 17267 + }, + { + "epoch": 2.1966670907009287, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.062103509902954, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8642358183860779, + "num_tokens": 658833779.0, + "step": 17268 + }, + { + "epoch": 2.196794300979519, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9117509126663208, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8705886602401733, + "num_tokens": 658874425.0, + "step": 17269 + }, + { + "epoch": 2.1969215112581097, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.863957405090332, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.874251663684845, + "num_tokens": 658912007.0, + "step": 17270 + }, + { + "epoch": 2.1970487215367003, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7971371412277222, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8580821752548218, + "num_tokens": 658956775.0, + "step": 17271 + }, + { + "epoch": 2.197175931815291, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8873608112335205, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8790616989135742, + "num_tokens": 658997132.0, + "step": 17272 + }, + { + "epoch": 2.1973031420938813, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.043767213821411, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8741581439971924, + "num_tokens": 659040437.0, + "step": 17273 + }, + { + "epoch": 2.197430352372472, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0433242321014404, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8815828561782837, + "num_tokens": 659074461.0, + "step": 17274 + }, + { + "epoch": 2.1975575626510624, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7821803092956543, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.862662672996521, + "num_tokens": 659113818.0, + "step": 17275 + }, + { + "epoch": 2.197684772929653, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.016066074371338, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8522931933403015, + "num_tokens": 659149297.0, + "step": 17276 + }, + { + "epoch": 2.1978119832082434, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.067538261413574, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8695333003997803, + "num_tokens": 659187500.0, + "step": 17277 + }, + { + "epoch": 2.1979391934868335, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8399907350540161, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8405330777168274, + "num_tokens": 659228624.0, + "step": 17278 + }, + { + "epoch": 2.198066403765424, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.865018606185913, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8738119006156921, + "num_tokens": 659270317.0, + "step": 17279 + }, + { + "epoch": 2.1981936140440146, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.036682367324829, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8646661043167114, + "num_tokens": 659302133.0, + "step": 17280 + }, + { + "epoch": 2.198320824322605, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.004318952560425, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8727896213531494, + "num_tokens": 659342469.0, + "step": 17281 + }, + { + "epoch": 2.1984480346011956, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.256084442138672, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8784940242767334, + "num_tokens": 659373786.0, + "step": 17282 + }, + { + "epoch": 2.198575244879786, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.896060585975647, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8586093187332153, + "num_tokens": 659412498.0, + "step": 17283 + }, + { + "epoch": 2.1987024551583767, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.052593946456909, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8507131338119507, + "num_tokens": 659446415.0, + "step": 17284 + }, + { + "epoch": 2.1988296654369672, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.912572979927063, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.85822993516922, + "num_tokens": 659488824.0, + "step": 17285 + }, + { + "epoch": 2.1989568757155578, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.028196334838867, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8695119023323059, + "num_tokens": 659523036.0, + "step": 17286 + }, + { + "epoch": 2.1990840859941483, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9112170934677124, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8603924512863159, + "num_tokens": 659562780.0, + "step": 17287 + }, + { + "epoch": 2.199211296272739, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.022639274597168, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8729759454727173, + "num_tokens": 659599364.0, + "step": 17288 + }, + { + "epoch": 2.1993385065513293, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8534190654754639, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8598389625549316, + "num_tokens": 659636275.0, + "step": 17289 + }, + { + "epoch": 2.19946571682992, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.158449649810791, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8654356002807617, + "num_tokens": 659670315.0, + "step": 17290 + }, + { + "epoch": 2.1995929271085104, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8065017461776733, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8612880706787109, + "num_tokens": 659710968.0, + "step": 17291 + }, + { + "epoch": 2.199720137387101, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9526002407073975, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8755706548690796, + "num_tokens": 659751334.0, + "step": 17292 + }, + { + "epoch": 2.1998473476656915, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7656872272491455, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8677505254745483, + "num_tokens": 659793867.0, + "step": 17293 + }, + { + "epoch": 2.199974557944282, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7430797815322876, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8681445717811584, + "num_tokens": 659838724.0, + "step": 17294 + }, + { + "epoch": 2.2001017682228725, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0206544399261475, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8637006282806396, + "num_tokens": 659876589.0, + "step": 17295 + }, + { + "epoch": 2.200228978501463, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8889687061309814, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8586797714233398, + "num_tokens": 659914864.0, + "step": 17296 + }, + { + "epoch": 2.2003561887800536, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9330729246139526, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8744579553604126, + "num_tokens": 659952482.0, + "step": 17297 + }, + { + "epoch": 2.200483399058644, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 3.7636594772338867, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8664951324462891, + "num_tokens": 659985192.0, + "step": 17298 + }, + { + "epoch": 2.2006106093372346, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1941306591033936, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8751208782196045, + "num_tokens": 660019505.0, + "step": 17299 + }, + { + "epoch": 2.200737819615825, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9461175203323364, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8597100973129272, + "num_tokens": 660054942.0, + "step": 17300 + }, + { + "epoch": 2.2008650298944152, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9004199504852295, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8651626110076904, + "num_tokens": 660092836.0, + "step": 17301 + }, + { + "epoch": 2.200992240173006, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8903096914291382, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8610368967056274, + "num_tokens": 660129419.0, + "step": 17302 + }, + { + "epoch": 2.2011194504515963, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7818461656570435, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8702412843704224, + "num_tokens": 660168116.0, + "step": 17303 + }, + { + "epoch": 2.201246660730187, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8751351833343506, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8609493970870972, + "num_tokens": 660208006.0, + "step": 17304 + }, + { + "epoch": 2.2013738710087773, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7747105360031128, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8660824298858643, + "num_tokens": 660254046.0, + "step": 17305 + }, + { + "epoch": 2.201501081287368, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9618562459945679, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8643524646759033, + "num_tokens": 660289321.0, + "step": 17306 + }, + { + "epoch": 2.2016282915659584, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9898022413253784, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.864088773727417, + "num_tokens": 660325052.0, + "step": 17307 + }, + { + "epoch": 2.201755501844549, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0274767875671387, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8571283221244812, + "num_tokens": 660357942.0, + "step": 17308 + }, + { + "epoch": 2.2018827121231395, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0644142627716064, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8412410616874695, + "num_tokens": 660390654.0, + "step": 17309 + }, + { + "epoch": 2.20200992240173, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8982059955596924, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8660537600517273, + "num_tokens": 660432020.0, + "step": 17310 + }, + { + "epoch": 2.2021371326803205, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9214167594909668, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8664355278015137, + "num_tokens": 660465051.0, + "step": 17311 + }, + { + "epoch": 2.202264342958911, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7653828859329224, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8613001108169556, + "num_tokens": 660504967.0, + "step": 17312 + }, + { + "epoch": 2.2023915532375016, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.863310694694519, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8561235070228577, + "num_tokens": 660543826.0, + "step": 17313 + }, + { + "epoch": 2.202518763516092, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.018918991088867, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8605682849884033, + "num_tokens": 660583074.0, + "step": 17314 + }, + { + "epoch": 2.2026459737946826, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7946785688400269, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8837738037109375, + "num_tokens": 660619974.0, + "step": 17315 + }, + { + "epoch": 2.202773184073273, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9065206050872803, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8717390298843384, + "num_tokens": 660654005.0, + "step": 17316 + }, + { + "epoch": 2.2029003943518637, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9727058410644531, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8555912971496582, + "num_tokens": 660689179.0, + "step": 17317 + }, + { + "epoch": 2.203027604630454, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.943819284439087, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.854383111000061, + "num_tokens": 660724923.0, + "step": 17318 + }, + { + "epoch": 2.2031548149090447, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9211126565933228, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8698435425758362, + "num_tokens": 660760763.0, + "step": 17319 + }, + { + "epoch": 2.2032820251876353, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8581535816192627, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8666301369667053, + "num_tokens": 660795520.0, + "step": 17320 + }, + { + "epoch": 2.203409235466226, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7850648164749146, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8750036954879761, + "num_tokens": 660834858.0, + "step": 17321 + }, + { + "epoch": 2.2035364457448163, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.864681601524353, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.874068021774292, + "num_tokens": 660870319.0, + "step": 17322 + }, + { + "epoch": 2.203663656023407, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8635776042938232, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.853035569190979, + "num_tokens": 660907313.0, + "step": 17323 + }, + { + "epoch": 2.2037908663019974, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.768964409828186, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8809069395065308, + "num_tokens": 660948087.0, + "step": 17324 + }, + { + "epoch": 2.203918076580588, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2126145362854004, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8706890344619751, + "num_tokens": 660984984.0, + "step": 17325 + }, + { + "epoch": 2.204045286859178, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1346027851104736, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8627883195877075, + "num_tokens": 661022898.0, + "step": 17326 + }, + { + "epoch": 2.2041724971377685, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9146121740341187, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8753209114074707, + "num_tokens": 661058645.0, + "step": 17327 + }, + { + "epoch": 2.204299707416359, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9218425750732422, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8888881206512451, + "num_tokens": 661096954.0, + "step": 17328 + }, + { + "epoch": 2.2044269176949496, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9270997047424316, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8541198968887329, + "num_tokens": 661136947.0, + "step": 17329 + }, + { + "epoch": 2.20455412797354, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8450853824615479, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.867651104927063, + "num_tokens": 661175376.0, + "step": 17330 + }, + { + "epoch": 2.2046813382521306, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 20.474536895751953, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8593015670776367, + "num_tokens": 661213238.0, + "step": 17331 + }, + { + "epoch": 2.204808548530721, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.182307481765747, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8823307752609253, + "num_tokens": 661246512.0, + "step": 17332 + }, + { + "epoch": 2.2049357588093117, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.975748896598816, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8767688870429993, + "num_tokens": 661288272.0, + "step": 17333 + }, + { + "epoch": 2.2050629690879022, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8881360292434692, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8723086714744568, + "num_tokens": 661328705.0, + "step": 17334 + }, + { + "epoch": 2.2051901793664928, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7219924926757812, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.87022465467453, + "num_tokens": 661369877.0, + "step": 17335 + }, + { + "epoch": 2.2053173896450833, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8480879068374634, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8689756393432617, + "num_tokens": 661405209.0, + "step": 17336 + }, + { + "epoch": 2.205444599923674, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9001929759979248, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8709322810173035, + "num_tokens": 661441083.0, + "step": 17337 + }, + { + "epoch": 2.2055718102022643, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8616235256195068, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8685611486434937, + "num_tokens": 661476938.0, + "step": 17338 + }, + { + "epoch": 2.205699020480855, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9170691967010498, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8583693504333496, + "num_tokens": 661519281.0, + "step": 17339 + }, + { + "epoch": 2.2058262307594454, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8515881299972534, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8819384574890137, + "num_tokens": 661555880.0, + "step": 17340 + }, + { + "epoch": 2.205953441038036, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9394406080245972, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8601976633071899, + "num_tokens": 661595937.0, + "step": 17341 + }, + { + "epoch": 2.2060806513166265, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9672813415527344, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8523693084716797, + "num_tokens": 661636868.0, + "step": 17342 + }, + { + "epoch": 2.206207861595217, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.764523983001709, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8593816161155701, + "num_tokens": 661681503.0, + "step": 17343 + }, + { + "epoch": 2.2063350718738075, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8324766159057617, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8688488602638245, + "num_tokens": 661718438.0, + "step": 17344 + }, + { + "epoch": 2.206462282152398, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0100557804107666, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8472352027893066, + "num_tokens": 661759048.0, + "step": 17345 + }, + { + "epoch": 2.2065894924309886, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7734110355377197, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8769347071647644, + "num_tokens": 661803131.0, + "step": 17346 + }, + { + "epoch": 2.206716702709579, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9508910179138184, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8667737245559692, + "num_tokens": 661838139.0, + "step": 17347 + }, + { + "epoch": 2.2068439129881696, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.959469199180603, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8593627214431763, + "num_tokens": 661878568.0, + "step": 17348 + }, + { + "epoch": 2.20697112326676, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0213513374328613, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.859455943107605, + "num_tokens": 661909110.0, + "step": 17349 + }, + { + "epoch": 2.2070983335453507, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9753929376602173, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8800973296165466, + "num_tokens": 661944472.0, + "step": 17350 + }, + { + "epoch": 2.2072255438239408, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8104095458984375, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8786834478378296, + "num_tokens": 661985865.0, + "step": 17351 + }, + { + "epoch": 2.2073527541025313, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.037628173828125, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8611976504325867, + "num_tokens": 662019460.0, + "step": 17352 + }, + { + "epoch": 2.207479964381122, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.918201208114624, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8737356662750244, + "num_tokens": 662059109.0, + "step": 17353 + }, + { + "epoch": 2.2076071746597123, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9362421035766602, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8586071729660034, + "num_tokens": 662096040.0, + "step": 17354 + }, + { + "epoch": 2.207734384938303, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7055736780166626, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8725786209106445, + "num_tokens": 662139351.0, + "step": 17355 + }, + { + "epoch": 2.2078615952168934, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9421119689941406, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8688067197799683, + "num_tokens": 662177353.0, + "step": 17356 + }, + { + "epoch": 2.207988805495484, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.337040901184082, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8555906414985657, + "num_tokens": 662209519.0, + "step": 17357 + }, + { + "epoch": 2.2081160157740745, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9907492399215698, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8471494317054749, + "num_tokens": 662251311.0, + "step": 17358 + }, + { + "epoch": 2.208243226052665, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.098417282104492, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8747062087059021, + "num_tokens": 662282770.0, + "step": 17359 + }, + { + "epoch": 2.2083704363312555, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7866865396499634, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8626859188079834, + "num_tokens": 662326770.0, + "step": 17360 + }, + { + "epoch": 2.208497646609846, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9165529012680054, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8648704290390015, + "num_tokens": 662366811.0, + "step": 17361 + }, + { + "epoch": 2.2086248568884366, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7604209184646606, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.863382875919342, + "num_tokens": 662404517.0, + "step": 17362 + }, + { + "epoch": 2.208752067167027, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9692732095718384, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8711179494857788, + "num_tokens": 662442091.0, + "step": 17363 + }, + { + "epoch": 2.2088792774456176, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.272538423538208, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8415039777755737, + "num_tokens": 662483343.0, + "step": 17364 + }, + { + "epoch": 2.209006487724208, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0058631896972656, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8604556322097778, + "num_tokens": 662521994.0, + "step": 17365 + }, + { + "epoch": 2.2091336980027987, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7510486841201782, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.861830472946167, + "num_tokens": 662562737.0, + "step": 17366 + }, + { + "epoch": 2.209260908281389, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.815075397491455, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8615792989730835, + "num_tokens": 662596927.0, + "step": 17367 + }, + { + "epoch": 2.2093881185599797, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9211981296539307, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8753286600112915, + "num_tokens": 662632831.0, + "step": 17368 + }, + { + "epoch": 2.2095153288385703, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8953583240509033, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8629966974258423, + "num_tokens": 662675554.0, + "step": 17369 + }, + { + "epoch": 2.209642539117161, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.690359115600586, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8788819313049316, + "num_tokens": 662720248.0, + "step": 17370 + }, + { + "epoch": 2.2097697493957513, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7302236557006836, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8662733435630798, + "num_tokens": 662761388.0, + "step": 17371 + }, + { + "epoch": 2.209896959674342, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.917452096939087, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8649905920028687, + "num_tokens": 662798007.0, + "step": 17372 + }, + { + "epoch": 2.2100241699529324, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.799847960472107, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8631142973899841, + "num_tokens": 662836042.0, + "step": 17373 + }, + { + "epoch": 2.210151380231523, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0760254859924316, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8561227321624756, + "num_tokens": 662869465.0, + "step": 17374 + }, + { + "epoch": 2.2102785905101134, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7843595743179321, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8574378490447998, + "num_tokens": 662909440.0, + "step": 17375 + }, + { + "epoch": 2.2104058007887035, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8484714031219482, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8627084493637085, + "num_tokens": 662949637.0, + "step": 17376 + }, + { + "epoch": 2.210533011067294, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.024324893951416, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8633159399032593, + "num_tokens": 662987679.0, + "step": 17377 + }, + { + "epoch": 2.2106602213458846, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.091081380844116, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8460881114006042, + "num_tokens": 663027651.0, + "step": 17378 + }, + { + "epoch": 2.210787431624475, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3103878498077393, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8794450759887695, + "num_tokens": 663063708.0, + "step": 17379 + }, + { + "epoch": 2.2109146419030656, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.047435760498047, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8581076264381409, + "num_tokens": 663097255.0, + "step": 17380 + }, + { + "epoch": 2.211041852181656, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9114866256713867, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8619867563247681, + "num_tokens": 663134496.0, + "step": 17381 + }, + { + "epoch": 2.2111690624602467, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.960974931716919, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.875531792640686, + "num_tokens": 663170278.0, + "step": 17382 + }, + { + "epoch": 2.211296272738837, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9787285327911377, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8639193773269653, + "num_tokens": 663205407.0, + "step": 17383 + }, + { + "epoch": 2.2114234830174277, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8840082883834839, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8714141845703125, + "num_tokens": 663243166.0, + "step": 17384 + }, + { + "epoch": 2.2115506932960183, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.882401466369629, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8879764080047607, + "num_tokens": 663282267.0, + "step": 17385 + }, + { + "epoch": 2.211677903574609, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9583348035812378, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8574110865592957, + "num_tokens": 663318620.0, + "step": 17386 + }, + { + "epoch": 2.2118051138531993, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.039546489715576, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8829936981201172, + "num_tokens": 663353256.0, + "step": 17387 + }, + { + "epoch": 2.21193232413179, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.113818883895874, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8685741424560547, + "num_tokens": 663386691.0, + "step": 17388 + }, + { + "epoch": 2.2120595344103804, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0894181728363037, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8620566725730896, + "num_tokens": 663416550.0, + "step": 17389 + }, + { + "epoch": 2.212186744688971, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8558149337768555, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8680909872055054, + "num_tokens": 663464912.0, + "step": 17390 + }, + { + "epoch": 2.2123139549675614, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9006826877593994, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8568450808525085, + "num_tokens": 663503786.0, + "step": 17391 + }, + { + "epoch": 2.212441165246152, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.944684624671936, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8738781809806824, + "num_tokens": 663540579.0, + "step": 17392 + }, + { + "epoch": 2.2125683755247425, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9389935731887817, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8682025074958801, + "num_tokens": 663577284.0, + "step": 17393 + }, + { + "epoch": 2.212695585803333, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.924436330795288, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8600901365280151, + "num_tokens": 663613118.0, + "step": 17394 + }, + { + "epoch": 2.2128227960819236, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9549654722213745, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8706729412078857, + "num_tokens": 663651548.0, + "step": 17395 + }, + { + "epoch": 2.212950006360514, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8340681791305542, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8478280901908875, + "num_tokens": 663694381.0, + "step": 17396 + }, + { + "epoch": 2.2130772166391046, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7557481527328491, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8687596321105957, + "num_tokens": 663734686.0, + "step": 17397 + }, + { + "epoch": 2.213204426917695, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9362112283706665, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8578999042510986, + "num_tokens": 663768764.0, + "step": 17398 + }, + { + "epoch": 2.2133316371962852, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9298243522644043, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8858768939971924, + "num_tokens": 663807650.0, + "step": 17399 + }, + { + "epoch": 2.213458847474876, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1094810962677, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8640696406364441, + "num_tokens": 663849308.0, + "step": 17400 + }, + { + "epoch": 2.2135860577534663, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7627589702606201, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8513773083686829, + "num_tokens": 663888917.0, + "step": 17401 + }, + { + "epoch": 2.213713268032057, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1607654094696045, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8667209148406982, + "num_tokens": 663921048.0, + "step": 17402 + }, + { + "epoch": 2.2138404783106473, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.762817621231079, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8711678385734558, + "num_tokens": 663963433.0, + "step": 17403 + }, + { + "epoch": 2.213967688589238, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0235419273376465, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.848211944103241, + "num_tokens": 664003294.0, + "step": 17404 + }, + { + "epoch": 2.2140948988678284, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.939498782157898, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8539115786552429, + "num_tokens": 664043161.0, + "step": 17405 + }, + { + "epoch": 2.214222109146419, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8922333717346191, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8771481513977051, + "num_tokens": 664082457.0, + "step": 17406 + }, + { + "epoch": 2.2143493194250095, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8148558139801025, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8767036199569702, + "num_tokens": 664125999.0, + "step": 17407 + }, + { + "epoch": 2.2144765297036, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8014084100723267, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8720831871032715, + "num_tokens": 664167256.0, + "step": 17408 + }, + { + "epoch": 2.2146037399821905, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9310928583145142, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.863772451877594, + "num_tokens": 664201887.0, + "step": 17409 + }, + { + "epoch": 2.214730950260781, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9561123847961426, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8711931109428406, + "num_tokens": 664238986.0, + "step": 17410 + }, + { + "epoch": 2.2148581605393716, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 16.59210777282715, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8624861240386963, + "num_tokens": 664283436.0, + "step": 17411 + }, + { + "epoch": 2.214985370817962, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.039458751678467, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8590469360351562, + "num_tokens": 664325196.0, + "step": 17412 + }, + { + "epoch": 2.2151125810965526, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1242318153381348, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8573211431503296, + "num_tokens": 664365722.0, + "step": 17413 + }, + { + "epoch": 2.215239791375143, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9264791011810303, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8564919233322144, + "num_tokens": 664411570.0, + "step": 17414 + }, + { + "epoch": 2.2153670016537337, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.139387369155884, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8677635192871094, + "num_tokens": 664453545.0, + "step": 17415 + }, + { + "epoch": 2.215494211932324, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9572391510009766, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8662927150726318, + "num_tokens": 664489882.0, + "step": 17416 + }, + { + "epoch": 2.2156214222109147, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.016312837600708, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8640276789665222, + "num_tokens": 664528720.0, + "step": 17417 + }, + { + "epoch": 2.2157486324895053, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0044710636138916, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8559972047805786, + "num_tokens": 664569645.0, + "step": 17418 + }, + { + "epoch": 2.215875842768096, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7489789724349976, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8613118529319763, + "num_tokens": 664612033.0, + "step": 17419 + }, + { + "epoch": 2.2160030530466863, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8996690511703491, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8723433613777161, + "num_tokens": 664651408.0, + "step": 17420 + }, + { + "epoch": 2.216130263325277, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9187207221984863, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8506221771240234, + "num_tokens": 664695764.0, + "step": 17421 + }, + { + "epoch": 2.2162574736038674, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.858807921409607, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8678629398345947, + "num_tokens": 664735528.0, + "step": 17422 + }, + { + "epoch": 2.216384683882458, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.10394287109375, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8577457070350647, + "num_tokens": 664775612.0, + "step": 17423 + }, + { + "epoch": 2.216511894161048, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9370243549346924, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8615461587905884, + "num_tokens": 664812485.0, + "step": 17424 + }, + { + "epoch": 2.2166391044396385, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9279704093933105, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8505934476852417, + "num_tokens": 664853166.0, + "step": 17425 + }, + { + "epoch": 2.216766314718229, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.021524429321289, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8545395135879517, + "num_tokens": 664888778.0, + "step": 17426 + }, + { + "epoch": 2.2168935249968196, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8301326036453247, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8635557889938354, + "num_tokens": 664933783.0, + "step": 17427 + }, + { + "epoch": 2.21702073527541, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.110377073287964, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8463146686553955, + "num_tokens": 664965228.0, + "step": 17428 + }, + { + "epoch": 2.2171479455540006, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0722594261169434, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8627257347106934, + "num_tokens": 664996691.0, + "step": 17429 + }, + { + "epoch": 2.217275155832591, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8617141246795654, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8748835325241089, + "num_tokens": 665032006.0, + "step": 17430 + }, + { + "epoch": 2.2174023661111817, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.911420226097107, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8545616865158081, + "num_tokens": 665071819.0, + "step": 17431 + }, + { + "epoch": 2.217529576389772, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.728134274482727, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8525644540786743, + "num_tokens": 665118883.0, + "step": 17432 + }, + { + "epoch": 2.2176567866683627, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0038490295410156, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8472768068313599, + "num_tokens": 665154926.0, + "step": 17433 + }, + { + "epoch": 2.2177839969469533, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9693665504455566, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8732839226722717, + "num_tokens": 665189910.0, + "step": 17434 + }, + { + "epoch": 2.217911207225544, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 3.9820504188537598, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8720219731330872, + "num_tokens": 665235849.0, + "step": 17435 + }, + { + "epoch": 2.2180384175041343, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0151870250701904, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8563432693481445, + "num_tokens": 665274231.0, + "step": 17436 + }, + { + "epoch": 2.218165627782725, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9591561555862427, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8830559253692627, + "num_tokens": 665309057.0, + "step": 17437 + }, + { + "epoch": 2.2182928380613154, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2032668590545654, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8612205386161804, + "num_tokens": 665337031.0, + "step": 17438 + }, + { + "epoch": 2.218420048339906, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9472824335098267, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8652512431144714, + "num_tokens": 665376557.0, + "step": 17439 + }, + { + "epoch": 2.2185472586184964, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 80.51927185058594, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8627939224243164, + "num_tokens": 665419928.0, + "step": 17440 + }, + { + "epoch": 2.218674468897087, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.999825358390808, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8715291023254395, + "num_tokens": 665458827.0, + "step": 17441 + }, + { + "epoch": 2.2188016791756775, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.929006576538086, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.866356372833252, + "num_tokens": 665498623.0, + "step": 17442 + }, + { + "epoch": 2.218928889454268, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9557524919509888, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8597116470336914, + "num_tokens": 665536103.0, + "step": 17443 + }, + { + "epoch": 2.2190560997328586, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.886033058166504, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8623000979423523, + "num_tokens": 665572089.0, + "step": 17444 + }, + { + "epoch": 2.219183310011449, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8518024682998657, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8562991619110107, + "num_tokens": 665608088.0, + "step": 17445 + }, + { + "epoch": 2.2193105202900396, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9135831594467163, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8737037181854248, + "num_tokens": 665641883.0, + "step": 17446 + }, + { + "epoch": 2.21943773056863, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8240529298782349, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8686593174934387, + "num_tokens": 665682211.0, + "step": 17447 + }, + { + "epoch": 2.2195649408472207, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7621878385543823, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8697672486305237, + "num_tokens": 665726182.0, + "step": 17448 + }, + { + "epoch": 2.2196921511258108, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1185302734375, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8683174252510071, + "num_tokens": 665760404.0, + "step": 17449 + }, + { + "epoch": 2.2198193614044013, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9570293426513672, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.864169716835022, + "num_tokens": 665798235.0, + "step": 17450 + }, + { + "epoch": 2.219946571682992, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7736766338348389, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8665844202041626, + "num_tokens": 665840118.0, + "step": 17451 + }, + { + "epoch": 2.2200737819615823, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8599966764450073, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8640219569206238, + "num_tokens": 665878498.0, + "step": 17452 + }, + { + "epoch": 2.220200992240173, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7718135118484497, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8608303070068359, + "num_tokens": 665924179.0, + "step": 17453 + }, + { + "epoch": 2.2203282025187634, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8824855089187622, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8798784017562866, + "num_tokens": 665963160.0, + "step": 17454 + }, + { + "epoch": 2.220455412797354, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9858494997024536, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8744316101074219, + "num_tokens": 665998854.0, + "step": 17455 + }, + { + "epoch": 2.2205826230759445, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8927232027053833, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8729442954063416, + "num_tokens": 666036420.0, + "step": 17456 + }, + { + "epoch": 2.220709833354535, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9551210403442383, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8448172211647034, + "num_tokens": 666076692.0, + "step": 17457 + }, + { + "epoch": 2.2208370436331255, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.967532753944397, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8517656922340393, + "num_tokens": 666118009.0, + "step": 17458 + }, + { + "epoch": 2.220964253911716, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9228081703186035, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8546205759048462, + "num_tokens": 666158132.0, + "step": 17459 + }, + { + "epoch": 2.2210914641903066, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.153050422668457, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8598666787147522, + "num_tokens": 666187195.0, + "step": 17460 + }, + { + "epoch": 2.221218674468897, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.349918842315674, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8692579865455627, + "num_tokens": 666227790.0, + "step": 17461 + }, + { + "epoch": 2.2213458847474876, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9472166299819946, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8580824136734009, + "num_tokens": 666268517.0, + "step": 17462 + }, + { + "epoch": 2.221473095026078, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.01753830909729, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8812928795814514, + "num_tokens": 666313727.0, + "step": 17463 + }, + { + "epoch": 2.2216003053046687, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9872932434082031, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.867316484451294, + "num_tokens": 666352713.0, + "step": 17464 + }, + { + "epoch": 2.221727515583259, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7974752187728882, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.872856616973877, + "num_tokens": 666395065.0, + "step": 17465 + }, + { + "epoch": 2.2218547258618497, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9936394691467285, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8665038347244263, + "num_tokens": 666428019.0, + "step": 17466 + }, + { + "epoch": 2.2219819361404403, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8605706691741943, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8634957075119019, + "num_tokens": 666466830.0, + "step": 17467 + }, + { + "epoch": 2.222109146419031, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1363210678100586, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8832697868347168, + "num_tokens": 666501028.0, + "step": 17468 + }, + { + "epoch": 2.2222363566976213, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8623082637786865, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8791344165802002, + "num_tokens": 666538554.0, + "step": 17469 + }, + { + "epoch": 2.222363566976212, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9372557401657104, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8652603626251221, + "num_tokens": 666577019.0, + "step": 17470 + }, + { + "epoch": 2.2224907772548024, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9222551584243774, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8787323832511902, + "num_tokens": 666616597.0, + "step": 17471 + }, + { + "epoch": 2.222617987533393, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0823144912719727, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8676629066467285, + "num_tokens": 666651523.0, + "step": 17472 + }, + { + "epoch": 2.2227451978119834, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0792076587677, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8542553186416626, + "num_tokens": 666689270.0, + "step": 17473 + }, + { + "epoch": 2.2228724080905735, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9895747900009155, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8621546030044556, + "num_tokens": 666726836.0, + "step": 17474 + }, + { + "epoch": 2.222999618369164, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1091387271881104, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8623079657554626, + "num_tokens": 666761207.0, + "step": 17475 + }, + { + "epoch": 2.2231268286477546, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0525593757629395, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.873466432094574, + "num_tokens": 666800293.0, + "step": 17476 + }, + { + "epoch": 2.223254038926345, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7081886529922485, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.875115156173706, + "num_tokens": 666843046.0, + "step": 17477 + }, + { + "epoch": 2.2233812492049356, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7442001104354858, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8800193071365356, + "num_tokens": 666885840.0, + "step": 17478 + }, + { + "epoch": 2.223508459483526, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0256268978118896, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8662202954292297, + "num_tokens": 666925654.0, + "step": 17479 + }, + { + "epoch": 2.2236356697621167, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.932128667831421, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8671883344650269, + "num_tokens": 666968098.0, + "step": 17480 + }, + { + "epoch": 2.223762880040707, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 16.59429168701172, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8767469525337219, + "num_tokens": 667010349.0, + "step": 17481 + }, + { + "epoch": 2.2238900903192977, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1161000728607178, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8584920167922974, + "num_tokens": 667047086.0, + "step": 17482 + }, + { + "epoch": 2.2240173005978883, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.112318277359009, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8485950827598572, + "num_tokens": 667083955.0, + "step": 17483 + }, + { + "epoch": 2.224144510876479, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0173323154449463, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8710935115814209, + "num_tokens": 667119751.0, + "step": 17484 + }, + { + "epoch": 2.2242717211550693, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0513863563537598, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8599259853363037, + "num_tokens": 667150980.0, + "step": 17485 + }, + { + "epoch": 2.22439893143366, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0133183002471924, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8615590333938599, + "num_tokens": 667185198.0, + "step": 17486 + }, + { + "epoch": 2.2245261417122504, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9111813306808472, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8677316904067993, + "num_tokens": 667225286.0, + "step": 17487 + }, + { + "epoch": 2.224653351990841, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8449617624282837, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8600862622261047, + "num_tokens": 667266294.0, + "step": 17488 + }, + { + "epoch": 2.2247805622694314, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0396227836608887, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8652215003967285, + "num_tokens": 667302430.0, + "step": 17489 + }, + { + "epoch": 2.224907772548022, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8423374891281128, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8596738576889038, + "num_tokens": 667342881.0, + "step": 17490 + }, + { + "epoch": 2.2250349828266125, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8371580839157104, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8668646812438965, + "num_tokens": 667376122.0, + "step": 17491 + }, + { + "epoch": 2.225162193105203, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8822028636932373, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8581581711769104, + "num_tokens": 667415120.0, + "step": 17492 + }, + { + "epoch": 2.2252894033837936, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9301691055297852, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8644329309463501, + "num_tokens": 667451014.0, + "step": 17493 + }, + { + "epoch": 2.225416613662384, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0399200916290283, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8567759394645691, + "num_tokens": 667485600.0, + "step": 17494 + }, + { + "epoch": 2.2255438239409746, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0250296592712402, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8599057197570801, + "num_tokens": 667527561.0, + "step": 17495 + }, + { + "epoch": 2.225671034219565, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9014379978179932, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8725537657737732, + "num_tokens": 667569677.0, + "step": 17496 + }, + { + "epoch": 2.225798244498155, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0008466243743896, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8645793199539185, + "num_tokens": 667605954.0, + "step": 17497 + }, + { + "epoch": 2.225925454776746, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.004443883895874, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8687621355056763, + "num_tokens": 667645082.0, + "step": 17498 + }, + { + "epoch": 2.2260526650553363, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8885678052902222, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.862723708152771, + "num_tokens": 667681390.0, + "step": 17499 + }, + { + "epoch": 2.226179875333927, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1075778007507324, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8554142713546753, + "num_tokens": 667722772.0, + "step": 17500 + }, + { + "epoch": 2.2263070856125173, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9721556901931763, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8418015241622925, + "num_tokens": 667758647.0, + "step": 17501 + }, + { + "epoch": 2.226434295891108, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.062011241912842, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8664205074310303, + "num_tokens": 667790625.0, + "step": 17502 + }, + { + "epoch": 2.2265615061696984, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.008927345275879, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8537701368331909, + "num_tokens": 667823724.0, + "step": 17503 + }, + { + "epoch": 2.226688716448289, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8359609842300415, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8776894807815552, + "num_tokens": 667862548.0, + "step": 17504 + }, + { + "epoch": 2.2268159267268794, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8372606039047241, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8575937747955322, + "num_tokens": 667905994.0, + "step": 17505 + }, + { + "epoch": 2.22694313700547, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.226768970489502, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8587262630462646, + "num_tokens": 667945087.0, + "step": 17506 + }, + { + "epoch": 2.2270703472840605, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.085310935974121, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8747373819351196, + "num_tokens": 667984736.0, + "step": 17507 + }, + { + "epoch": 2.227197557562651, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9207541942596436, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8586163520812988, + "num_tokens": 668024020.0, + "step": 17508 + }, + { + "epoch": 2.2273247678412416, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9703960418701172, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8601895570755005, + "num_tokens": 668059567.0, + "step": 17509 + }, + { + "epoch": 2.227451978119832, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8543825149536133, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.871830403804779, + "num_tokens": 668099901.0, + "step": 17510 + }, + { + "epoch": 2.2275791883984226, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7333475351333618, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8792308568954468, + "num_tokens": 668135974.0, + "step": 17511 + }, + { + "epoch": 2.227706398677013, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8643096685409546, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8685956001281738, + "num_tokens": 668179200.0, + "step": 17512 + }, + { + "epoch": 2.2278336089556037, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9580647945404053, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8432759642601013, + "num_tokens": 668219683.0, + "step": 17513 + }, + { + "epoch": 2.227960819234194, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.015624523162842, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8670132160186768, + "num_tokens": 668253899.0, + "step": 17514 + }, + { + "epoch": 2.2280880295127847, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.393890142440796, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8581227660179138, + "num_tokens": 668294140.0, + "step": 17515 + }, + { + "epoch": 2.2282152397913753, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.020587205886841, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8692252039909363, + "num_tokens": 668328442.0, + "step": 17516 + }, + { + "epoch": 2.228342450069966, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9322967529296875, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8675804138183594, + "num_tokens": 668367903.0, + "step": 17517 + }, + { + "epoch": 2.2284696603485563, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8105162382125854, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8763040900230408, + "num_tokens": 668409889.0, + "step": 17518 + }, + { + "epoch": 2.228596870627147, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 16.63096809387207, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8697673082351685, + "num_tokens": 668451061.0, + "step": 17519 + }, + { + "epoch": 2.2287240809057374, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.361517906188965, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8425359725952148, + "num_tokens": 668493038.0, + "step": 17520 + }, + { + "epoch": 2.228851291184328, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.068594455718994, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8727540969848633, + "num_tokens": 668528087.0, + "step": 17521 + }, + { + "epoch": 2.228978501462918, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9445867538452148, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.866647481918335, + "num_tokens": 668562325.0, + "step": 17522 + }, + { + "epoch": 2.2291057117415085, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8183848857879639, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8661097884178162, + "num_tokens": 668601336.0, + "step": 17523 + }, + { + "epoch": 2.229232922020099, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.957103967666626, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8847296237945557, + "num_tokens": 668637113.0, + "step": 17524 + }, + { + "epoch": 2.2293601322986896, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.034895658493042, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8606202602386475, + "num_tokens": 668675645.0, + "step": 17525 + }, + { + "epoch": 2.22948734257728, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9665781259536743, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8665502667427063, + "num_tokens": 668706480.0, + "step": 17526 + }, + { + "epoch": 2.2296145528558706, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9453506469726562, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8591545820236206, + "num_tokens": 668743956.0, + "step": 17527 + }, + { + "epoch": 2.229741763134461, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.038080930709839, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8736653327941895, + "num_tokens": 668779134.0, + "step": 17528 + }, + { + "epoch": 2.2298689734130517, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1770708560943604, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8677501678466797, + "num_tokens": 668814978.0, + "step": 17529 + }, + { + "epoch": 2.229996183691642, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0645790100097656, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8491995334625244, + "num_tokens": 668854194.0, + "step": 17530 + }, + { + "epoch": 2.2301233939702327, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.916740894317627, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8563521504402161, + "num_tokens": 668893095.0, + "step": 17531 + }, + { + "epoch": 2.2302506042488233, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8706899881362915, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8735870122909546, + "num_tokens": 668930626.0, + "step": 17532 + }, + { + "epoch": 2.230377814527414, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9614224433898926, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8742140531539917, + "num_tokens": 668960941.0, + "step": 17533 + }, + { + "epoch": 2.2305050248060043, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9276056289672852, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8676875233650208, + "num_tokens": 668997771.0, + "step": 17534 + }, + { + "epoch": 2.230632235084595, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.809085488319397, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8786424398422241, + "num_tokens": 669038714.0, + "step": 17535 + }, + { + "epoch": 2.2307594453631854, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.008030414581299, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8731808662414551, + "num_tokens": 669072336.0, + "step": 17536 + }, + { + "epoch": 2.230886655641776, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.027690887451172, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8738490343093872, + "num_tokens": 669110303.0, + "step": 17537 + }, + { + "epoch": 2.2310138659203664, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1254353523254395, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8565694689750671, + "num_tokens": 669146874.0, + "step": 17538 + }, + { + "epoch": 2.231141076198957, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9759877920150757, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8771247863769531, + "num_tokens": 669181682.0, + "step": 17539 + }, + { + "epoch": 2.2312682864775475, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6860262155532837, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8665531277656555, + "num_tokens": 669227201.0, + "step": 17540 + }, + { + "epoch": 2.231395496756138, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7456014156341553, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8669727444648743, + "num_tokens": 669265915.0, + "step": 17541 + }, + { + "epoch": 2.2315227070347285, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9845011234283447, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8700559139251709, + "num_tokens": 669304130.0, + "step": 17542 + }, + { + "epoch": 2.231649917313319, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9543664455413818, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8636581897735596, + "num_tokens": 669344225.0, + "step": 17543 + }, + { + "epoch": 2.2317771275919096, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9076939821243286, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.847099781036377, + "num_tokens": 669385126.0, + "step": 17544 + }, + { + "epoch": 2.2319043378705, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.98248291015625, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.85443115234375, + "num_tokens": 669425774.0, + "step": 17545 + }, + { + "epoch": 2.2320315481490907, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.234591484069824, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8549147844314575, + "num_tokens": 669459789.0, + "step": 17546 + }, + { + "epoch": 2.2321587584276807, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9599616527557373, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8816031217575073, + "num_tokens": 669498709.0, + "step": 17547 + }, + { + "epoch": 2.2322859687062713, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8992522954940796, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8705217838287354, + "num_tokens": 669536021.0, + "step": 17548 + }, + { + "epoch": 2.232413178984862, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9040290117263794, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.853266716003418, + "num_tokens": 669574316.0, + "step": 17549 + }, + { + "epoch": 2.2325403892634523, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.842484712600708, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8561537861824036, + "num_tokens": 669611826.0, + "step": 17550 + }, + { + "epoch": 2.232667599542043, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7325594425201416, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8622918725013733, + "num_tokens": 669655277.0, + "step": 17551 + }, + { + "epoch": 2.2327948098206334, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8251031637191772, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8590733408927917, + "num_tokens": 669693481.0, + "step": 17552 + }, + { + "epoch": 2.232922020099224, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2302401065826416, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8643670082092285, + "num_tokens": 669724104.0, + "step": 17553 + }, + { + "epoch": 2.2330492303778144, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0008411407470703, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8494117259979248, + "num_tokens": 669768725.0, + "step": 17554 + }, + { + "epoch": 2.233176440656405, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.778271198272705, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8748468160629272, + "num_tokens": 669809508.0, + "step": 17555 + }, + { + "epoch": 2.2333036509349955, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.8762919902801514, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8666951656341553, + "num_tokens": 669848047.0, + "step": 17556 + }, + { + "epoch": 2.233430861213586, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0028209686279297, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8815866708755493, + "num_tokens": 669881274.0, + "step": 17557 + }, + { + "epoch": 2.2335580714921766, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9885034561157227, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8591756224632263, + "num_tokens": 669918517.0, + "step": 17558 + }, + { + "epoch": 2.233685281770767, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1484594345092773, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8576041460037231, + "num_tokens": 669960990.0, + "step": 17559 + }, + { + "epoch": 2.2338124920493576, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0191352367401123, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8684636354446411, + "num_tokens": 669995542.0, + "step": 17560 + }, + { + "epoch": 2.233939702327948, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7418321371078491, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8650838732719421, + "num_tokens": 670036798.0, + "step": 17561 + }, + { + "epoch": 2.2340669126065387, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9822391271591187, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8645741939544678, + "num_tokens": 670071763.0, + "step": 17562 + }, + { + "epoch": 2.234194122885129, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.7734572887420654, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.86324143409729, + "num_tokens": 670100864.0, + "step": 17563 + }, + { + "epoch": 2.2343213331637197, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9235560894012451, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8537474870681763, + "num_tokens": 670144114.0, + "step": 17564 + }, + { + "epoch": 2.2344485434423103, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8491475582122803, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8698952794075012, + "num_tokens": 670182719.0, + "step": 17565 + }, + { + "epoch": 2.234575753720901, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.202148675918579, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.868091344833374, + "num_tokens": 670218059.0, + "step": 17566 + }, + { + "epoch": 2.2347029639994913, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9291298389434814, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.861663818359375, + "num_tokens": 670259589.0, + "step": 17567 + }, + { + "epoch": 2.234830174278082, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8412773609161377, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8565433025360107, + "num_tokens": 670301755.0, + "step": 17568 + }, + { + "epoch": 2.2349573845566724, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8261545896530151, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8695195913314819, + "num_tokens": 670339205.0, + "step": 17569 + }, + { + "epoch": 2.235084594835263, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7921651601791382, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8685240149497986, + "num_tokens": 670382961.0, + "step": 17570 + }, + { + "epoch": 2.2352118051138534, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0228934288024902, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8547036647796631, + "num_tokens": 670420747.0, + "step": 17571 + }, + { + "epoch": 2.2353390153924435, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.87086820602417, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8809283375740051, + "num_tokens": 670457906.0, + "step": 17572 + }, + { + "epoch": 2.235466225671034, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9835023880004883, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8538217544555664, + "num_tokens": 670493187.0, + "step": 17573 + }, + { + "epoch": 2.2355934359496246, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.846534013748169, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8778060674667358, + "num_tokens": 670529750.0, + "step": 17574 + }, + { + "epoch": 2.235720646228215, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1716437339782715, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8618478178977966, + "num_tokens": 670570147.0, + "step": 17575 + }, + { + "epoch": 2.2358478565068056, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9910856485366821, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8670687675476074, + "num_tokens": 670604113.0, + "step": 17576 + }, + { + "epoch": 2.235975066785396, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9748567342758179, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.865921676158905, + "num_tokens": 670645451.0, + "step": 17577 + }, + { + "epoch": 2.2361022770639867, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9856010675430298, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.870806097984314, + "num_tokens": 670684055.0, + "step": 17578 + }, + { + "epoch": 2.236229487342577, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9355100393295288, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8563101291656494, + "num_tokens": 670724397.0, + "step": 17579 + }, + { + "epoch": 2.2363566976211677, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8591406345367432, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8767653703689575, + "num_tokens": 670761282.0, + "step": 17580 + }, + { + "epoch": 2.2364839078997583, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9288638830184937, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8700727224349976, + "num_tokens": 670800007.0, + "step": 17581 + }, + { + "epoch": 2.236611118178349, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9660956859588623, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8622152805328369, + "num_tokens": 670840040.0, + "step": 17582 + }, + { + "epoch": 2.2367383284569393, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.5891666412353516, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8581399917602539, + "num_tokens": 670881193.0, + "step": 17583 + }, + { + "epoch": 2.23686553873553, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7866231203079224, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8694895505905151, + "num_tokens": 670927275.0, + "step": 17584 + }, + { + "epoch": 2.2369927490141204, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.801905632019043, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8688070178031921, + "num_tokens": 670967327.0, + "step": 17585 + }, + { + "epoch": 2.237119959292711, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8992280960083008, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.86964350938797, + "num_tokens": 671009250.0, + "step": 17586 + }, + { + "epoch": 2.2372471695713014, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9176346063613892, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8680354356765747, + "num_tokens": 671045068.0, + "step": 17587 + }, + { + "epoch": 2.237374379849892, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9322950839996338, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8529982566833496, + "num_tokens": 671084786.0, + "step": 17588 + }, + { + "epoch": 2.2375015901284825, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.019683599472046, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8575830459594727, + "num_tokens": 671123235.0, + "step": 17589 + }, + { + "epoch": 2.237628800407073, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9548205137252808, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8806184530258179, + "num_tokens": 671161702.0, + "step": 17590 + }, + { + "epoch": 2.2377560106856635, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9748696088790894, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8768983483314514, + "num_tokens": 671195487.0, + "step": 17591 + }, + { + "epoch": 2.237883220964254, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0074734687805176, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8637025356292725, + "num_tokens": 671232288.0, + "step": 17592 + }, + { + "epoch": 2.2380104312428446, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.055899143218994, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.849082350730896, + "num_tokens": 671275154.0, + "step": 17593 + }, + { + "epoch": 2.238137641521435, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.001774311065674, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8554511070251465, + "num_tokens": 671312266.0, + "step": 17594 + }, + { + "epoch": 2.238264851800025, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.906842827796936, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8648355007171631, + "num_tokens": 671349690.0, + "step": 17595 + }, + { + "epoch": 2.238392062078616, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8575844764709473, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8679488897323608, + "num_tokens": 671387898.0, + "step": 17596 + }, + { + "epoch": 2.2385192723572063, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0897130966186523, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.843217134475708, + "num_tokens": 671428054.0, + "step": 17597 + }, + { + "epoch": 2.238646482635797, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9031434059143066, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.871486485004425, + "num_tokens": 671467096.0, + "step": 17598 + }, + { + "epoch": 2.2387736929143873, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.93022620677948, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8654669523239136, + "num_tokens": 671506752.0, + "step": 17599 + }, + { + "epoch": 2.238900903192978, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9562652111053467, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.85774165391922, + "num_tokens": 671542403.0, + "step": 17600 + }, + { + "epoch": 2.2390281134715684, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9319660663604736, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8760784864425659, + "num_tokens": 671578115.0, + "step": 17601 + }, + { + "epoch": 2.239155323750159, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8190664052963257, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8775492906570435, + "num_tokens": 671619854.0, + "step": 17602 + }, + { + "epoch": 2.2392825340287494, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7702635526657104, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8790525794029236, + "num_tokens": 671661617.0, + "step": 17603 + }, + { + "epoch": 2.23940974430734, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.977177381515503, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8542921543121338, + "num_tokens": 671702231.0, + "step": 17604 + }, + { + "epoch": 2.2395369545859305, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8940073251724243, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8655776977539062, + "num_tokens": 671741483.0, + "step": 17605 + }, + { + "epoch": 2.239664164864521, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0168848037719727, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8601582050323486, + "num_tokens": 671780168.0, + "step": 17606 + }, + { + "epoch": 2.2397913751431116, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9697351455688477, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8418997526168823, + "num_tokens": 671818980.0, + "step": 17607 + }, + { + "epoch": 2.239918585421702, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0658130645751953, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8631855845451355, + "num_tokens": 671850010.0, + "step": 17608 + }, + { + "epoch": 2.2400457957002926, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9122726917266846, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8574855327606201, + "num_tokens": 671888506.0, + "step": 17609 + }, + { + "epoch": 2.240173005978883, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.767516851425171, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8747681379318237, + "num_tokens": 671930965.0, + "step": 17610 + }, + { + "epoch": 2.2403002162574737, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.046322822570801, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8737166523933411, + "num_tokens": 671967472.0, + "step": 17611 + }, + { + "epoch": 2.240427426536064, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8779046535491943, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8881590366363525, + "num_tokens": 672005986.0, + "step": 17612 + }, + { + "epoch": 2.2405546368146547, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9151734113693237, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8646763563156128, + "num_tokens": 672044729.0, + "step": 17613 + }, + { + "epoch": 2.2406818470932452, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8721412420272827, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8652877807617188, + "num_tokens": 672080141.0, + "step": 17614 + }, + { + "epoch": 2.2408090573718358, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8734086751937866, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8724701404571533, + "num_tokens": 672119277.0, + "step": 17615 + }, + { + "epoch": 2.2409362676504263, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.015334367752075, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8631778359413147, + "num_tokens": 672154205.0, + "step": 17616 + }, + { + "epoch": 2.241063477929017, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8294949531555176, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8642411828041077, + "num_tokens": 672193667.0, + "step": 17617 + }, + { + "epoch": 2.2411906882076074, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.299710988998413, + "learning_rate": 1e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.8360581398010254, + "num_tokens": 672232360.0, + "step": 17618 + }, + { + "epoch": 2.241317898486198, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.9832231998443604, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8521192073822021, + "num_tokens": 672264127.0, + "step": 17619 + }, + { + "epoch": 2.241445108764788, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9310603141784668, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8728867769241333, + "num_tokens": 672300719.0, + "step": 17620 + }, + { + "epoch": 2.2415723190433785, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8335306644439697, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8759490251541138, + "num_tokens": 672342796.0, + "step": 17621 + }, + { + "epoch": 2.241699529321969, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9116337299346924, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8679091930389404, + "num_tokens": 672382597.0, + "step": 17622 + }, + { + "epoch": 2.2418267396005596, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2187206745147705, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8530555963516235, + "num_tokens": 672427350.0, + "step": 17623 + }, + { + "epoch": 2.24195394987915, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.6169469356536865, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.858008086681366, + "num_tokens": 672468997.0, + "step": 17624 + }, + { + "epoch": 2.2420811601577406, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9774750471115112, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8706717491149902, + "num_tokens": 672505061.0, + "step": 17625 + }, + { + "epoch": 2.242208370436331, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.997479796409607, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8643508553504944, + "num_tokens": 672544241.0, + "step": 17626 + }, + { + "epoch": 2.2423355807149217, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 8.616289138793945, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8634935617446899, + "num_tokens": 672586334.0, + "step": 17627 + }, + { + "epoch": 2.242462790993512, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1353797912597656, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8608613014221191, + "num_tokens": 672628627.0, + "step": 17628 + }, + { + "epoch": 2.2425900012721027, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 7.832006931304932, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8551969528198242, + "num_tokens": 672663046.0, + "step": 17629 + }, + { + "epoch": 2.2427172115506933, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9455188512802124, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8714243769645691, + "num_tokens": 672703394.0, + "step": 17630 + }, + { + "epoch": 2.242844421829284, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8370461463928223, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8549157381057739, + "num_tokens": 672743262.0, + "step": 17631 + }, + { + "epoch": 2.2429716321078743, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8743705749511719, + "learning_rate": 1e-06, + "loss": 0.5363, + "mean_token_accuracy": 0.8311705589294434, + "num_tokens": 672782662.0, + "step": 17632 + }, + { + "epoch": 2.243098842386465, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.057372808456421, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8631648421287537, + "num_tokens": 672818400.0, + "step": 17633 + }, + { + "epoch": 2.2432260526650554, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.855198860168457, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8774256706237793, + "num_tokens": 672858843.0, + "step": 17634 + }, + { + "epoch": 2.243353262943646, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.989274024963379, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8666874170303345, + "num_tokens": 672900397.0, + "step": 17635 + }, + { + "epoch": 2.2434804732222364, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0011062622070312, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8640813231468201, + "num_tokens": 672936025.0, + "step": 17636 + }, + { + "epoch": 2.243607683500827, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9445890188217163, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8465266227722168, + "num_tokens": 672979180.0, + "step": 17637 + }, + { + "epoch": 2.2437348937794175, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8933895826339722, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8752814531326294, + "num_tokens": 673018053.0, + "step": 17638 + }, + { + "epoch": 2.243862104058008, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9304465055465698, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8451532125473022, + "num_tokens": 673057596.0, + "step": 17639 + }, + { + "epoch": 2.2439893143365985, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3496274948120117, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8767397403717041, + "num_tokens": 673097740.0, + "step": 17640 + }, + { + "epoch": 2.244116524615189, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6878854036331177, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8702692985534668, + "num_tokens": 673141821.0, + "step": 17641 + }, + { + "epoch": 2.2442437348937796, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7167737483978271, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.87408047914505, + "num_tokens": 673184279.0, + "step": 17642 + }, + { + "epoch": 2.24437094517237, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1923468112945557, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8747153282165527, + "num_tokens": 673224328.0, + "step": 17643 + }, + { + "epoch": 2.2444981554509607, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.936408281326294, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8505123853683472, + "num_tokens": 673263328.0, + "step": 17644 + }, + { + "epoch": 2.2446253657295507, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.927809238433838, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8704085350036621, + "num_tokens": 673301663.0, + "step": 17645 + }, + { + "epoch": 2.2447525760081413, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8989384174346924, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8623445630073547, + "num_tokens": 673340606.0, + "step": 17646 + }, + { + "epoch": 2.244879786286732, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 7.715332984924316, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8611614108085632, + "num_tokens": 673382201.0, + "step": 17647 + }, + { + "epoch": 2.2450069965653223, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 3.0348072052001953, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8746435642242432, + "num_tokens": 673423330.0, + "step": 17648 + }, + { + "epoch": 2.245134206843913, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8800736665725708, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8817847371101379, + "num_tokens": 673459690.0, + "step": 17649 + }, + { + "epoch": 2.2452614171225034, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1681277751922607, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8615032434463501, + "num_tokens": 673494827.0, + "step": 17650 + }, + { + "epoch": 2.245388627401094, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9100302457809448, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8534934520721436, + "num_tokens": 673540568.0, + "step": 17651 + }, + { + "epoch": 2.2455158376796844, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9812602996826172, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8808892965316772, + "num_tokens": 673574022.0, + "step": 17652 + }, + { + "epoch": 2.245643047958275, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8631559610366821, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8608882427215576, + "num_tokens": 673612493.0, + "step": 17653 + }, + { + "epoch": 2.2457702582368655, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8028943538665771, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.862238883972168, + "num_tokens": 673653729.0, + "step": 17654 + }, + { + "epoch": 2.245897468515456, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7619813680648804, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8764381408691406, + "num_tokens": 673695626.0, + "step": 17655 + }, + { + "epoch": 2.2460246787940465, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9586493968963623, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8721795082092285, + "num_tokens": 673732188.0, + "step": 17656 + }, + { + "epoch": 2.246151889072637, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8317160606384277, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8571716547012329, + "num_tokens": 673770259.0, + "step": 17657 + }, + { + "epoch": 2.2462790993512276, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9755228757858276, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8615628480911255, + "num_tokens": 673805018.0, + "step": 17658 + }, + { + "epoch": 2.246406309629818, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.975891351699829, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8577055335044861, + "num_tokens": 673846058.0, + "step": 17659 + }, + { + "epoch": 2.2465335199084087, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0712971687316895, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8523560762405396, + "num_tokens": 673886437.0, + "step": 17660 + }, + { + "epoch": 2.246660730186999, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8346459865570068, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8685331344604492, + "num_tokens": 673925701.0, + "step": 17661 + }, + { + "epoch": 2.2467879404655897, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7972424030303955, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8685777187347412, + "num_tokens": 673969854.0, + "step": 17662 + }, + { + "epoch": 2.2469151507441802, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9141596555709839, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8664675951004028, + "num_tokens": 674005048.0, + "step": 17663 + }, + { + "epoch": 2.2470423610227708, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8318434953689575, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8600201606750488, + "num_tokens": 674046548.0, + "step": 17664 + }, + { + "epoch": 2.2471695713013613, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8865631818771362, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8596134185791016, + "num_tokens": 674086905.0, + "step": 17665 + }, + { + "epoch": 2.247296781579952, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8063719272613525, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8745958805084229, + "num_tokens": 674124589.0, + "step": 17666 + }, + { + "epoch": 2.2474239918585424, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.792743444442749, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8769999146461487, + "num_tokens": 674165135.0, + "step": 17667 + }, + { + "epoch": 2.247551202137133, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8695247173309326, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8599054217338562, + "num_tokens": 674207269.0, + "step": 17668 + }, + { + "epoch": 2.2476784124157234, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8594915866851807, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8819935321807861, + "num_tokens": 674240806.0, + "step": 17669 + }, + { + "epoch": 2.2478056226943135, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0090138912200928, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.855959415435791, + "num_tokens": 674274697.0, + "step": 17670 + }, + { + "epoch": 2.247932832972904, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9209299087524414, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8567398190498352, + "num_tokens": 674311362.0, + "step": 17671 + }, + { + "epoch": 2.2480600432514946, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8622640371322632, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8713115453720093, + "num_tokens": 674352505.0, + "step": 17672 + }, + { + "epoch": 2.248187253530085, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.987281084060669, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8695641160011292, + "num_tokens": 674383938.0, + "step": 17673 + }, + { + "epoch": 2.2483144638086756, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9125345945358276, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8631002306938171, + "num_tokens": 674421665.0, + "step": 17674 + }, + { + "epoch": 2.248441674087266, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.5155627727508545, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8676697015762329, + "num_tokens": 674466065.0, + "step": 17675 + }, + { + "epoch": 2.2485688843658567, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7677146196365356, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8660885691642761, + "num_tokens": 674506868.0, + "step": 17676 + }, + { + "epoch": 2.248696094644447, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8277947902679443, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8594791889190674, + "num_tokens": 674545756.0, + "step": 17677 + }, + { + "epoch": 2.2488233049230377, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8014994859695435, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8611912727355957, + "num_tokens": 674590680.0, + "step": 17678 + }, + { + "epoch": 2.2489505152016283, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9072790145874023, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8707129955291748, + "num_tokens": 674631624.0, + "step": 17679 + }, + { + "epoch": 2.249077725480219, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9357925653457642, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8605130910873413, + "num_tokens": 674667365.0, + "step": 17680 + }, + { + "epoch": 2.2492049357588093, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8987585306167603, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.863684892654419, + "num_tokens": 674706587.0, + "step": 17681 + }, + { + "epoch": 2.2493321460374, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9270453453063965, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8631311655044556, + "num_tokens": 674744480.0, + "step": 17682 + }, + { + "epoch": 2.2494593563159904, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8542003631591797, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.865752637386322, + "num_tokens": 674783104.0, + "step": 17683 + }, + { + "epoch": 2.249586566594581, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8793193101882935, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8576961755752563, + "num_tokens": 674823488.0, + "step": 17684 + }, + { + "epoch": 2.2497137768731714, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9144965410232544, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8705666065216064, + "num_tokens": 674857322.0, + "step": 17685 + }, + { + "epoch": 2.249840987151762, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8679746389389038, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8553395867347717, + "num_tokens": 674901566.0, + "step": 17686 + }, + { + "epoch": 2.2499681974303525, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9562711715698242, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8697165250778198, + "num_tokens": 674942803.0, + "step": 17687 + }, + { + "epoch": 2.250095407708943, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8903745412826538, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8555474281311035, + "num_tokens": 674983160.0, + "step": 17688 + }, + { + "epoch": 2.2502226179875335, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9257220029830933, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8661360144615173, + "num_tokens": 675021243.0, + "step": 17689 + }, + { + "epoch": 2.250349828266124, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.022031307220459, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8612624406814575, + "num_tokens": 675059435.0, + "step": 17690 + }, + { + "epoch": 2.2504770385447146, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1300036907196045, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8680427074432373, + "num_tokens": 675091689.0, + "step": 17691 + }, + { + "epoch": 2.250604248823305, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9319205284118652, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8595668077468872, + "num_tokens": 675130368.0, + "step": 17692 + }, + { + "epoch": 2.250731459101895, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8369369506835938, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8677248954772949, + "num_tokens": 675171745.0, + "step": 17693 + }, + { + "epoch": 2.250858669380486, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9434128999710083, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8635764122009277, + "num_tokens": 675206052.0, + "step": 17694 + }, + { + "epoch": 2.2509858796590763, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9353249073028564, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.859423816204071, + "num_tokens": 675243231.0, + "step": 17695 + }, + { + "epoch": 2.251113089937667, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.194817066192627, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8576154708862305, + "num_tokens": 675273582.0, + "step": 17696 + }, + { + "epoch": 2.2512403002162573, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8716496229171753, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8618264198303223, + "num_tokens": 675314247.0, + "step": 17697 + }, + { + "epoch": 2.251367510494848, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9599064588546753, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8659249544143677, + "num_tokens": 675355675.0, + "step": 17698 + }, + { + "epoch": 2.2514947207734384, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1151909828186035, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8622259497642517, + "num_tokens": 675394290.0, + "step": 17699 + }, + { + "epoch": 2.251621931052029, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9795202016830444, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8679717779159546, + "num_tokens": 675434251.0, + "step": 17700 + }, + { + "epoch": 2.2517491413306194, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7609919309616089, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8712053298950195, + "num_tokens": 675474731.0, + "step": 17701 + }, + { + "epoch": 2.25187635160921, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0456480979919434, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8607194423675537, + "num_tokens": 675515180.0, + "step": 17702 + }, + { + "epoch": 2.2520035618878005, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9834623336791992, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8522602319717407, + "num_tokens": 675553429.0, + "step": 17703 + }, + { + "epoch": 2.252130772166391, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9977526664733887, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8679546117782593, + "num_tokens": 675592919.0, + "step": 17704 + }, + { + "epoch": 2.2522579824449815, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.963431715965271, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8730206489562988, + "num_tokens": 675627528.0, + "step": 17705 + }, + { + "epoch": 2.252385192723572, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8200773000717163, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8740806579589844, + "num_tokens": 675663721.0, + "step": 17706 + }, + { + "epoch": 2.2525124030021626, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9430947303771973, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8587338328361511, + "num_tokens": 675705187.0, + "step": 17707 + }, + { + "epoch": 2.252639613280753, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9600694179534912, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.875564694404602, + "num_tokens": 675752713.0, + "step": 17708 + }, + { + "epoch": 2.2527668235593437, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.043482780456543, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8645758032798767, + "num_tokens": 675791009.0, + "step": 17709 + }, + { + "epoch": 2.252894033837934, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.165247678756714, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8682093024253845, + "num_tokens": 675828890.0, + "step": 17710 + }, + { + "epoch": 2.2530212441165247, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.044506072998047, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8657947182655334, + "num_tokens": 675862855.0, + "step": 17711 + }, + { + "epoch": 2.2531484543951152, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9440478086471558, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8624869585037231, + "num_tokens": 675903032.0, + "step": 17712 + }, + { + "epoch": 2.2532756646737058, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8825998306274414, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8596863746643066, + "num_tokens": 675943035.0, + "step": 17713 + }, + { + "epoch": 2.2534028749522963, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.899387240409851, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.867935299873352, + "num_tokens": 675983073.0, + "step": 17714 + }, + { + "epoch": 2.253530085230887, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9242311716079712, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8703299164772034, + "num_tokens": 676022341.0, + "step": 17715 + }, + { + "epoch": 2.2536572955094774, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.943918228149414, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8626159429550171, + "num_tokens": 676060907.0, + "step": 17716 + }, + { + "epoch": 2.253784505788068, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.5055878162384033, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8630834817886353, + "num_tokens": 676100693.0, + "step": 17717 + }, + { + "epoch": 2.253911716066658, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8972097635269165, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8794227838516235, + "num_tokens": 676137526.0, + "step": 17718 + }, + { + "epoch": 2.254038926345249, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8243869543075562, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8673757910728455, + "num_tokens": 676177245.0, + "step": 17719 + }, + { + "epoch": 2.254166136623839, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9146809577941895, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8722615242004395, + "num_tokens": 676214568.0, + "step": 17720 + }, + { + "epoch": 2.2542933469024296, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8535869121551514, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8669860363006592, + "num_tokens": 676252552.0, + "step": 17721 + }, + { + "epoch": 2.25442055718102, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.908684253692627, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8467015027999878, + "num_tokens": 676290480.0, + "step": 17722 + }, + { + "epoch": 2.2545477674596106, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.6055123805999756, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8766324520111084, + "num_tokens": 676324950.0, + "step": 17723 + }, + { + "epoch": 2.254674977738201, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.923789143562317, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8577660918235779, + "num_tokens": 676363851.0, + "step": 17724 + }, + { + "epoch": 2.2548021880167917, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8452028036117554, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8743748664855957, + "num_tokens": 676399920.0, + "step": 17725 + }, + { + "epoch": 2.254929398295382, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3627471923828125, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8627063035964966, + "num_tokens": 676443243.0, + "step": 17726 + }, + { + "epoch": 2.2550566085739727, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1650943756103516, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.852186918258667, + "num_tokens": 676480272.0, + "step": 17727 + }, + { + "epoch": 2.2551838188525632, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9934014081954956, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8615413308143616, + "num_tokens": 676516630.0, + "step": 17728 + }, + { + "epoch": 2.2553110291311538, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.00408673286438, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8690203428268433, + "num_tokens": 676557842.0, + "step": 17729 + }, + { + "epoch": 2.2554382394097443, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7169148921966553, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8750333189964294, + "num_tokens": 676599413.0, + "step": 17730 + }, + { + "epoch": 2.255565449688335, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9398061037063599, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8566291332244873, + "num_tokens": 676637791.0, + "step": 17731 + }, + { + "epoch": 2.2556926599669254, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7421587705612183, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8598492741584778, + "num_tokens": 676677108.0, + "step": 17732 + }, + { + "epoch": 2.255819870245516, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2272605895996094, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8688732385635376, + "num_tokens": 676720698.0, + "step": 17733 + }, + { + "epoch": 2.2559470805241064, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7733935117721558, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8533773422241211, + "num_tokens": 676763719.0, + "step": 17734 + }, + { + "epoch": 2.256074290802697, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9329816102981567, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8660394549369812, + "num_tokens": 676801083.0, + "step": 17735 + }, + { + "epoch": 2.2562015010812875, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9833855628967285, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8573054671287537, + "num_tokens": 676840861.0, + "step": 17736 + }, + { + "epoch": 2.256328711359878, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8622503280639648, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8780298233032227, + "num_tokens": 676876864.0, + "step": 17737 + }, + { + "epoch": 2.2564559216384685, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7892712354660034, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8840290904045105, + "num_tokens": 676920983.0, + "step": 17738 + }, + { + "epoch": 2.256583131917059, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8543617725372314, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8746069669723511, + "num_tokens": 676961294.0, + "step": 17739 + }, + { + "epoch": 2.2567103421956496, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8727072477340698, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.857890248298645, + "num_tokens": 676996318.0, + "step": 17740 + }, + { + "epoch": 2.2568375524742397, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9293266534805298, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8669408559799194, + "num_tokens": 677034083.0, + "step": 17741 + }, + { + "epoch": 2.2569647627528306, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9403212070465088, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.865939199924469, + "num_tokens": 677071206.0, + "step": 17742 + }, + { + "epoch": 2.2570919730314207, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0979011058807373, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8690189123153687, + "num_tokens": 677103573.0, + "step": 17743 + }, + { + "epoch": 2.2572191833100113, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8571239709854126, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8745251297950745, + "num_tokens": 677141406.0, + "step": 17744 + }, + { + "epoch": 2.257346393588602, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.827612280845642, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8546147346496582, + "num_tokens": 677185968.0, + "step": 17745 + }, + { + "epoch": 2.2574736038671923, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8131777048110962, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8567498922348022, + "num_tokens": 677229004.0, + "step": 17746 + }, + { + "epoch": 2.257600814145783, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7029496431350708, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8764886856079102, + "num_tokens": 677266317.0, + "step": 17747 + }, + { + "epoch": 2.2577280244243734, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8484684228897095, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8747599124908447, + "num_tokens": 677302344.0, + "step": 17748 + }, + { + "epoch": 2.257855234702964, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3284199237823486, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8563883304595947, + "num_tokens": 677342103.0, + "step": 17749 + }, + { + "epoch": 2.2579824449815544, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0272769927978516, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8512111902236938, + "num_tokens": 677378602.0, + "step": 17750 + }, + { + "epoch": 2.258109655260145, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1827597618103027, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8643997311592102, + "num_tokens": 677415005.0, + "step": 17751 + }, + { + "epoch": 2.2582368655387355, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.016106605529785, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8462303280830383, + "num_tokens": 677457908.0, + "step": 17752 + }, + { + "epoch": 2.258364075817326, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0817105770111084, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8539638519287109, + "num_tokens": 677492759.0, + "step": 17753 + }, + { + "epoch": 2.2584912860959165, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8201571702957153, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8627070188522339, + "num_tokens": 677535771.0, + "step": 17754 + }, + { + "epoch": 2.258618496374507, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.803184151649475, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8778058290481567, + "num_tokens": 677575325.0, + "step": 17755 + }, + { + "epoch": 2.2587457066530976, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8423995971679688, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8651889562606812, + "num_tokens": 677614132.0, + "step": 17756 + }, + { + "epoch": 2.258872916931688, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1292994022369385, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8591663241386414, + "num_tokens": 677645657.0, + "step": 17757 + }, + { + "epoch": 2.2590001272102787, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.866302490234375, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8736340403556824, + "num_tokens": 677686238.0, + "step": 17758 + }, + { + "epoch": 2.259127337488869, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8849228620529175, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8684980869293213, + "num_tokens": 677724080.0, + "step": 17759 + }, + { + "epoch": 2.2592545477674597, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8080618381500244, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8647233247756958, + "num_tokens": 677765768.0, + "step": 17760 + }, + { + "epoch": 2.2593817580460502, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0706946849823, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8654985427856445, + "num_tokens": 677803937.0, + "step": 17761 + }, + { + "epoch": 2.2595089683246408, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9106365442276, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8761428594589233, + "num_tokens": 677839685.0, + "step": 17762 + }, + { + "epoch": 2.2596361786032313, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.360354423522949, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8506906032562256, + "num_tokens": 677880411.0, + "step": 17763 + }, + { + "epoch": 2.259763388881822, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8179702758789062, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8735380172729492, + "num_tokens": 677921177.0, + "step": 17764 + }, + { + "epoch": 2.2598905991604123, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0145280361175537, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8738370537757874, + "num_tokens": 677956393.0, + "step": 17765 + }, + { + "epoch": 2.2600178094390024, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.156128406524658, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8640958666801453, + "num_tokens": 677988364.0, + "step": 17766 + }, + { + "epoch": 2.2601450197175934, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8670294284820557, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8727990388870239, + "num_tokens": 678028093.0, + "step": 17767 + }, + { + "epoch": 2.2602722299961835, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9641999006271362, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8669556975364685, + "num_tokens": 678068174.0, + "step": 17768 + }, + { + "epoch": 2.260399440274774, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9012211561203003, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8671067953109741, + "num_tokens": 678105784.0, + "step": 17769 + }, + { + "epoch": 2.2605266505533645, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.846941351890564, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8764520287513733, + "num_tokens": 678142873.0, + "step": 17770 + }, + { + "epoch": 2.260653860831955, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8839901685714722, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.86208176612854, + "num_tokens": 678181527.0, + "step": 17771 + }, + { + "epoch": 2.2607810711105456, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8244662284851074, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8670585751533508, + "num_tokens": 678221225.0, + "step": 17772 + }, + { + "epoch": 2.260908281389136, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9073237180709839, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8741864562034607, + "num_tokens": 678254767.0, + "step": 17773 + }, + { + "epoch": 2.2610354916677267, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.930620551109314, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8669430613517761, + "num_tokens": 678290830.0, + "step": 17774 + }, + { + "epoch": 2.261162701946317, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9324673414230347, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8750422596931458, + "num_tokens": 678327102.0, + "step": 17775 + }, + { + "epoch": 2.2612899122249077, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8998398780822754, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8785752058029175, + "num_tokens": 678364872.0, + "step": 17776 + }, + { + "epoch": 2.2614171225034982, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.154564380645752, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.858741819858551, + "num_tokens": 678399233.0, + "step": 17777 + }, + { + "epoch": 2.2615443327820888, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.044264316558838, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8736777305603027, + "num_tokens": 678434577.0, + "step": 17778 + }, + { + "epoch": 2.2616715430606793, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.188080310821533, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8713982105255127, + "num_tokens": 678466835.0, + "step": 17779 + }, + { + "epoch": 2.26179875333927, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8334792852401733, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8813502788543701, + "num_tokens": 678506817.0, + "step": 17780 + }, + { + "epoch": 2.2619259636178604, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8775033950805664, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.866227388381958, + "num_tokens": 678543734.0, + "step": 17781 + }, + { + "epoch": 2.262053173896451, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.940470576286316, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8607861995697021, + "num_tokens": 678585066.0, + "step": 17782 + }, + { + "epoch": 2.2621803841750414, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.066225528717041, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8815507888793945, + "num_tokens": 678614951.0, + "step": 17783 + }, + { + "epoch": 2.262307594453632, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.063577890396118, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8686452507972717, + "num_tokens": 678649671.0, + "step": 17784 + }, + { + "epoch": 2.2624348047322225, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6144695281982422, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8826683759689331, + "num_tokens": 678693418.0, + "step": 17785 + }, + { + "epoch": 2.262562015010813, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8223068714141846, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8761482238769531, + "num_tokens": 678730891.0, + "step": 17786 + }, + { + "epoch": 2.2626892252894035, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9084409475326538, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8738802075386047, + "num_tokens": 678766369.0, + "step": 17787 + }, + { + "epoch": 2.262816435567994, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7965222597122192, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.856020987033844, + "num_tokens": 678809118.0, + "step": 17788 + }, + { + "epoch": 2.2629436458465846, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8857660293579102, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8648495078086853, + "num_tokens": 678850057.0, + "step": 17789 + }, + { + "epoch": 2.263070856125175, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7996997833251953, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8654454350471497, + "num_tokens": 678890253.0, + "step": 17790 + }, + { + "epoch": 2.263198066403765, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1113064289093018, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8805923461914062, + "num_tokens": 678922535.0, + "step": 17791 + }, + { + "epoch": 2.263325276682356, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.4629266262054443, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8648328185081482, + "num_tokens": 678956136.0, + "step": 17792 + }, + { + "epoch": 2.2634524869609463, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.131758689880371, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.868136465549469, + "num_tokens": 678990864.0, + "step": 17793 + }, + { + "epoch": 2.263579697239537, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8375831842422485, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.878153920173645, + "num_tokens": 679028345.0, + "step": 17794 + }, + { + "epoch": 2.2637069075181273, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8363628387451172, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8529666662216187, + "num_tokens": 679073679.0, + "step": 17795 + }, + { + "epoch": 2.263834117796718, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8190547227859497, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.864557683467865, + "num_tokens": 679109274.0, + "step": 17796 + }, + { + "epoch": 2.2639613280753084, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1206576824188232, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8669188618659973, + "num_tokens": 679141644.0, + "step": 17797 + }, + { + "epoch": 2.264088538353899, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8343737125396729, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.849878191947937, + "num_tokens": 679184351.0, + "step": 17798 + }, + { + "epoch": 2.2642157486324894, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9809387922286987, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8634636998176575, + "num_tokens": 679222509.0, + "step": 17799 + }, + { + "epoch": 2.26434295891108, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0861666202545166, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8496817350387573, + "num_tokens": 679263362.0, + "step": 17800 + }, + { + "epoch": 2.2644701691896705, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8038864135742188, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8536364436149597, + "num_tokens": 679307859.0, + "step": 17801 + }, + { + "epoch": 2.264597379468261, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9298871755599976, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8760824203491211, + "num_tokens": 679344801.0, + "step": 17802 + }, + { + "epoch": 2.2647245897468515, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.051964044570923, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8584824800491333, + "num_tokens": 679382795.0, + "step": 17803 + }, + { + "epoch": 2.264851800025442, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3676199913024902, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8630306720733643, + "num_tokens": 679425282.0, + "step": 17804 + }, + { + "epoch": 2.2649790103040326, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9161763191223145, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8531182408332825, + "num_tokens": 679466881.0, + "step": 17805 + }, + { + "epoch": 2.265106220582623, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7022095918655396, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8779026865959167, + "num_tokens": 679509268.0, + "step": 17806 + }, + { + "epoch": 2.2652334308612136, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9774677753448486, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8631646633148193, + "num_tokens": 679545082.0, + "step": 17807 + }, + { + "epoch": 2.265360641139804, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0025041103363037, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8670128583908081, + "num_tokens": 679581795.0, + "step": 17808 + }, + { + "epoch": 2.2654878514183947, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.072876453399658, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8633313179016113, + "num_tokens": 679618978.0, + "step": 17809 + }, + { + "epoch": 2.2656150616969852, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.254241704940796, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8712640404701233, + "num_tokens": 679655002.0, + "step": 17810 + }, + { + "epoch": 2.2657422719755758, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7837740182876587, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8818881511688232, + "num_tokens": 679691143.0, + "step": 17811 + }, + { + "epoch": 2.2658694822541663, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1056272983551025, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8680391907691956, + "num_tokens": 679729168.0, + "step": 17812 + }, + { + "epoch": 2.265996692532757, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 7.762422561645508, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.864859938621521, + "num_tokens": 679773385.0, + "step": 17813 + }, + { + "epoch": 2.2661239028113473, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9926130771636963, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8768690228462219, + "num_tokens": 679813884.0, + "step": 17814 + }, + { + "epoch": 2.266251113089938, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9768048524856567, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8673930764198303, + "num_tokens": 679850834.0, + "step": 17815 + }, + { + "epoch": 2.266378323368528, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9251298904418945, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8715914487838745, + "num_tokens": 679890210.0, + "step": 17816 + }, + { + "epoch": 2.266505533647119, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.979146122932434, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8750995397567749, + "num_tokens": 679926252.0, + "step": 17817 + }, + { + "epoch": 2.266632743925709, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9583384990692139, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8642760515213013, + "num_tokens": 679959626.0, + "step": 17818 + }, + { + "epoch": 2.2667599542042995, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8593578338623047, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.861747682094574, + "num_tokens": 679999973.0, + "step": 17819 + }, + { + "epoch": 2.26688716448289, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9841575622558594, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8594131469726562, + "num_tokens": 680041081.0, + "step": 17820 + }, + { + "epoch": 2.2670143747614806, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.936191201210022, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8701661825180054, + "num_tokens": 680078953.0, + "step": 17821 + }, + { + "epoch": 2.267141585040071, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9241243600845337, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8753515481948853, + "num_tokens": 680112309.0, + "step": 17822 + }, + { + "epoch": 2.2672687953186617, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2054147720336914, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.861753523349762, + "num_tokens": 680148607.0, + "step": 17823 + }, + { + "epoch": 2.267396005597252, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7511712312698364, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8847255110740662, + "num_tokens": 680189617.0, + "step": 17824 + }, + { + "epoch": 2.2675232158758427, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7886234521865845, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8832365274429321, + "num_tokens": 680227940.0, + "step": 17825 + }, + { + "epoch": 2.2676504261544332, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9000484943389893, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8791624307632446, + "num_tokens": 680263473.0, + "step": 17826 + }, + { + "epoch": 2.2677776364330238, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.123652219772339, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8595514893531799, + "num_tokens": 680297895.0, + "step": 17827 + }, + { + "epoch": 2.2679048467116143, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9149580001831055, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8682422637939453, + "num_tokens": 680337658.0, + "step": 17828 + }, + { + "epoch": 2.268032056990205, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9304859638214111, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.861874520778656, + "num_tokens": 680375912.0, + "step": 17829 + }, + { + "epoch": 2.2681592672687954, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3884637355804443, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8772175312042236, + "num_tokens": 680415231.0, + "step": 17830 + }, + { + "epoch": 2.268286477547386, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8178248405456543, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.862305760383606, + "num_tokens": 680460037.0, + "step": 17831 + }, + { + "epoch": 2.2684136878259764, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.7568631172180176, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8433000445365906, + "num_tokens": 680505521.0, + "step": 17832 + }, + { + "epoch": 2.268540898104567, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7847390174865723, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8805468082427979, + "num_tokens": 680547682.0, + "step": 17833 + }, + { + "epoch": 2.2686681083831575, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.876574993133545, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8712068796157837, + "num_tokens": 680587510.0, + "step": 17834 + }, + { + "epoch": 2.268795318661748, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.862930178642273, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8687260150909424, + "num_tokens": 680624233.0, + "step": 17835 + }, + { + "epoch": 2.2689225289403385, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8276350498199463, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.861919641494751, + "num_tokens": 680666091.0, + "step": 17836 + }, + { + "epoch": 2.269049739218929, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9017765522003174, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8743210434913635, + "num_tokens": 680703062.0, + "step": 17837 + }, + { + "epoch": 2.2691769494975196, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.063908100128174, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.88041090965271, + "num_tokens": 680735966.0, + "step": 17838 + }, + { + "epoch": 2.2693041597761097, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7746608257293701, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8666824102401733, + "num_tokens": 680780010.0, + "step": 17839 + }, + { + "epoch": 2.2694313700547006, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0397682189941406, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8661187887191772, + "num_tokens": 680819516.0, + "step": 17840 + }, + { + "epoch": 2.2695585803332907, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8662285804748535, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8644652366638184, + "num_tokens": 680861374.0, + "step": 17841 + }, + { + "epoch": 2.2696857906118812, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.971329689025879, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8583218455314636, + "num_tokens": 680902447.0, + "step": 17842 + }, + { + "epoch": 2.2698130008904718, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7850615978240967, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8837813138961792, + "num_tokens": 680942376.0, + "step": 17843 + }, + { + "epoch": 2.2699402111690623, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.919838786125183, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.866877555847168, + "num_tokens": 680977060.0, + "step": 17844 + }, + { + "epoch": 2.270067421447653, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.070366621017456, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8498094081878662, + "num_tokens": 681014504.0, + "step": 17845 + }, + { + "epoch": 2.2701946317262434, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.8096723556518555, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8508569002151489, + "num_tokens": 681053919.0, + "step": 17846 + }, + { + "epoch": 2.270321842004834, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.070497512817383, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8661277294158936, + "num_tokens": 681090883.0, + "step": 17847 + }, + { + "epoch": 2.2704490522834244, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8699028491973877, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8812447786331177, + "num_tokens": 681128819.0, + "step": 17848 + }, + { + "epoch": 2.270576262562015, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.140528440475464, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8560287952423096, + "num_tokens": 681171434.0, + "step": 17849 + }, + { + "epoch": 2.2707034728406055, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8744839429855347, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8763411045074463, + "num_tokens": 681206297.0, + "step": 17850 + }, + { + "epoch": 2.270830683119196, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9845839738845825, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8562320470809937, + "num_tokens": 681242521.0, + "step": 17851 + }, + { + "epoch": 2.2709578933977865, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8409219980239868, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.876737117767334, + "num_tokens": 681282474.0, + "step": 17852 + }, + { + "epoch": 2.271085103676377, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0441901683807373, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8629708886146545, + "num_tokens": 681317280.0, + "step": 17853 + }, + { + "epoch": 2.2712123139549676, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0883634090423584, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8643273711204529, + "num_tokens": 681349155.0, + "step": 17854 + }, + { + "epoch": 2.271339524233558, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.162801742553711, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8656255006790161, + "num_tokens": 681380863.0, + "step": 17855 + }, + { + "epoch": 2.2714667345121486, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8650798797607422, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8703004121780396, + "num_tokens": 681421970.0, + "step": 17856 + }, + { + "epoch": 2.271593944790739, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8514893054962158, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8807487487792969, + "num_tokens": 681453511.0, + "step": 17857 + }, + { + "epoch": 2.2717211550693297, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 7.803545951843262, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8541736602783203, + "num_tokens": 681496293.0, + "step": 17858 + }, + { + "epoch": 2.2718483653479202, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.212892532348633, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8593968749046326, + "num_tokens": 681537074.0, + "step": 17859 + }, + { + "epoch": 2.2719755756265108, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9883904457092285, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8601566553115845, + "num_tokens": 681578911.0, + "step": 17860 + }, + { + "epoch": 2.2721027859051013, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8350088596343994, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.86428302526474, + "num_tokens": 681622134.0, + "step": 17861 + }, + { + "epoch": 2.272229996183692, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.4265527725219727, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8673042058944702, + "num_tokens": 681655919.0, + "step": 17862 + }, + { + "epoch": 2.2723572064622823, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9317454099655151, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8578124046325684, + "num_tokens": 681696539.0, + "step": 17863 + }, + { + "epoch": 2.2724844167408724, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.994642734527588, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.863525390625, + "num_tokens": 681734630.0, + "step": 17864 + }, + { + "epoch": 2.2726116270194634, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.881513237953186, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8547194004058838, + "num_tokens": 681779669.0, + "step": 17865 + }, + { + "epoch": 2.2727388372980535, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9532432556152344, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8645825386047363, + "num_tokens": 681821920.0, + "step": 17866 + }, + { + "epoch": 2.272866047576644, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0285542011260986, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8645485043525696, + "num_tokens": 681859239.0, + "step": 17867 + }, + { + "epoch": 2.2729932578552345, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0353269577026367, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8535563945770264, + "num_tokens": 681896569.0, + "step": 17868 + }, + { + "epoch": 2.273120468133825, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.972593069076538, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8811376094818115, + "num_tokens": 681932549.0, + "step": 17869 + }, + { + "epoch": 2.2732476784124156, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8283430337905884, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8625800609588623, + "num_tokens": 681973981.0, + "step": 17870 + }, + { + "epoch": 2.273374888691006, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0909411907196045, + "learning_rate": 1e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8415460586547852, + "num_tokens": 682009512.0, + "step": 17871 + }, + { + "epoch": 2.2735020989695967, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.157895565032959, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8626425266265869, + "num_tokens": 682048058.0, + "step": 17872 + }, + { + "epoch": 2.273629309248187, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.050471305847168, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.85868239402771, + "num_tokens": 682084181.0, + "step": 17873 + }, + { + "epoch": 2.2737565195267777, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8650554418563843, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8863718509674072, + "num_tokens": 682119613.0, + "step": 17874 + }, + { + "epoch": 2.2738837298053682, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.908801555633545, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8683304786682129, + "num_tokens": 682159501.0, + "step": 17875 + }, + { + "epoch": 2.2740109400839588, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8857347965240479, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8750187158584595, + "num_tokens": 682197323.0, + "step": 17876 + }, + { + "epoch": 2.2741381503625493, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0686819553375244, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8675050735473633, + "num_tokens": 682231150.0, + "step": 17877 + }, + { + "epoch": 2.27426536064114, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.918945550918579, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8674583435058594, + "num_tokens": 682269351.0, + "step": 17878 + }, + { + "epoch": 2.2743925709197303, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.977311372756958, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8765681385993958, + "num_tokens": 682303633.0, + "step": 17879 + }, + { + "epoch": 2.274519781198321, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.128014087677002, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8646214008331299, + "num_tokens": 682340890.0, + "step": 17880 + }, + { + "epoch": 2.2746469914769114, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.26180100440979, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8818891644477844, + "num_tokens": 682374253.0, + "step": 17881 + }, + { + "epoch": 2.274774201755502, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9483333826065063, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8665375709533691, + "num_tokens": 682412675.0, + "step": 17882 + }, + { + "epoch": 2.2749014120340925, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.369077682495117, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8765448331832886, + "num_tokens": 682447864.0, + "step": 17883 + }, + { + "epoch": 2.275028622312683, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9048075675964355, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8676427006721497, + "num_tokens": 682486445.0, + "step": 17884 + }, + { + "epoch": 2.2751558325912735, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.940711498260498, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8655712008476257, + "num_tokens": 682518173.0, + "step": 17885 + }, + { + "epoch": 2.275283042869864, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.881363034248352, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8657607436180115, + "num_tokens": 682558627.0, + "step": 17886 + }, + { + "epoch": 2.2754102531484546, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7360213994979858, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8719877004623413, + "num_tokens": 682600821.0, + "step": 17887 + }, + { + "epoch": 2.275537463427045, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7497466802597046, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8548034429550171, + "num_tokens": 682643947.0, + "step": 17888 + }, + { + "epoch": 2.275664673705635, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8628382682800293, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8608706593513489, + "num_tokens": 682683045.0, + "step": 17889 + }, + { + "epoch": 2.275791883984226, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8046436309814453, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8657060861587524, + "num_tokens": 682721617.0, + "step": 17890 + }, + { + "epoch": 2.2759190942628162, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8698745965957642, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8604490756988525, + "num_tokens": 682758339.0, + "step": 17891 + }, + { + "epoch": 2.2760463045414068, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8861746788024902, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8712714910507202, + "num_tokens": 682800501.0, + "step": 17892 + }, + { + "epoch": 2.2761735148199973, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9833259582519531, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8650022745132446, + "num_tokens": 682833231.0, + "step": 17893 + }, + { + "epoch": 2.276300725098588, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8575128316879272, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8712834119796753, + "num_tokens": 682873945.0, + "step": 17894 + }, + { + "epoch": 2.2764279353771784, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7650212049484253, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8647782206535339, + "num_tokens": 682918329.0, + "step": 17895 + }, + { + "epoch": 2.276555145655769, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9068574905395508, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8736783862113953, + "num_tokens": 682956277.0, + "step": 17896 + }, + { + "epoch": 2.2766823559343594, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8697525262832642, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8699662685394287, + "num_tokens": 682990648.0, + "step": 17897 + }, + { + "epoch": 2.27680956621295, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8910948038101196, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8798144459724426, + "num_tokens": 683029046.0, + "step": 17898 + }, + { + "epoch": 2.2769367764915405, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9409362077713013, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.865636944770813, + "num_tokens": 683064541.0, + "step": 17899 + }, + { + "epoch": 2.277063986770131, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9111826419830322, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8659950494766235, + "num_tokens": 683105015.0, + "step": 17900 + }, + { + "epoch": 2.2771911970487215, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8788114786148071, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8589978814125061, + "num_tokens": 683142938.0, + "step": 17901 + }, + { + "epoch": 2.277318407327312, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8132084608078003, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.880239725112915, + "num_tokens": 683181593.0, + "step": 17902 + }, + { + "epoch": 2.2774456176059026, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9740524291992188, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8547844886779785, + "num_tokens": 683218641.0, + "step": 17903 + }, + { + "epoch": 2.277572827884493, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9033277034759521, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8503315448760986, + "num_tokens": 683261479.0, + "step": 17904 + }, + { + "epoch": 2.2777000381630836, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9974679946899414, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8402600288391113, + "num_tokens": 683300602.0, + "step": 17905 + }, + { + "epoch": 2.277827248441674, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.990820288658142, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8650065660476685, + "num_tokens": 683337012.0, + "step": 17906 + }, + { + "epoch": 2.2779544587202647, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0655534267425537, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8736543655395508, + "num_tokens": 683368949.0, + "step": 17907 + }, + { + "epoch": 2.2780816689988552, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9348790645599365, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8639355301856995, + "num_tokens": 683408741.0, + "step": 17908 + }, + { + "epoch": 2.2782088792774458, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.064918279647827, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8571804761886597, + "num_tokens": 683447843.0, + "step": 17909 + }, + { + "epoch": 2.2783360895560363, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0816006660461426, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8631664514541626, + "num_tokens": 683489695.0, + "step": 17910 + }, + { + "epoch": 2.278463299834627, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0381720066070557, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8634509444236755, + "num_tokens": 683528381.0, + "step": 17911 + }, + { + "epoch": 2.2785905101132173, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8321563005447388, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8679602146148682, + "num_tokens": 683567261.0, + "step": 17912 + }, + { + "epoch": 2.278717720391808, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0324270725250244, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8519120216369629, + "num_tokens": 683601083.0, + "step": 17913 + }, + { + "epoch": 2.278844930670398, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9702684879302979, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8590916395187378, + "num_tokens": 683643494.0, + "step": 17914 + }, + { + "epoch": 2.278972140948989, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8762396574020386, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8612251281738281, + "num_tokens": 683684267.0, + "step": 17915 + }, + { + "epoch": 2.279099351227579, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8595730066299438, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8550446629524231, + "num_tokens": 683723292.0, + "step": 17916 + }, + { + "epoch": 2.2792265615061695, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8539438247680664, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8782455921173096, + "num_tokens": 683761378.0, + "step": 17917 + }, + { + "epoch": 2.27935377178476, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.435826301574707, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8569750785827637, + "num_tokens": 683794838.0, + "step": 17918 + }, + { + "epoch": 2.2794809820633506, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0113813877105713, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8485286831855774, + "num_tokens": 683832307.0, + "step": 17919 + }, + { + "epoch": 2.279608192341941, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0936481952667236, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8590449690818787, + "num_tokens": 683871848.0, + "step": 17920 + }, + { + "epoch": 2.2797354026205316, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7979626655578613, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8695393800735474, + "num_tokens": 683915086.0, + "step": 17921 + }, + { + "epoch": 2.279862612899122, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0248842239379883, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8583106398582458, + "num_tokens": 683945411.0, + "step": 17922 + }, + { + "epoch": 2.2799898231777127, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9845106601715088, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8648474812507629, + "num_tokens": 683979664.0, + "step": 17923 + }, + { + "epoch": 2.2801170334563032, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9447975158691406, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8562988042831421, + "num_tokens": 684025526.0, + "step": 17924 + }, + { + "epoch": 2.2802442437348938, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.276679039001465, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8591704368591309, + "num_tokens": 684056992.0, + "step": 17925 + }, + { + "epoch": 2.2803714540134843, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.994279384613037, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8697988986968994, + "num_tokens": 684094515.0, + "step": 17926 + }, + { + "epoch": 2.280498664292075, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.926628828048706, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8726940751075745, + "num_tokens": 684132294.0, + "step": 17927 + }, + { + "epoch": 2.2806258745706653, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0118515491485596, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8696551322937012, + "num_tokens": 684164010.0, + "step": 17928 + }, + { + "epoch": 2.280753084849256, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8732521533966064, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8751540184020996, + "num_tokens": 684202077.0, + "step": 17929 + }, + { + "epoch": 2.2808802951278464, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.995345950126648, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8397144079208374, + "num_tokens": 684240799.0, + "step": 17930 + }, + { + "epoch": 2.281007505406437, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7328331470489502, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8637651205062866, + "num_tokens": 684284080.0, + "step": 17931 + }, + { + "epoch": 2.2811347156850275, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.144359827041626, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8493216037750244, + "num_tokens": 684317915.0, + "step": 17932 + }, + { + "epoch": 2.281261925963618, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8095570802688599, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.9035739302635193, + "num_tokens": 684354665.0, + "step": 17933 + }, + { + "epoch": 2.2813891362422085, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.908111333847046, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8651530146598816, + "num_tokens": 684392866.0, + "step": 17934 + }, + { + "epoch": 2.281516346520799, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 16.597890853881836, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8689253330230713, + "num_tokens": 684428422.0, + "step": 17935 + }, + { + "epoch": 2.2816435567993896, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.088704824447632, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8525092601776123, + "num_tokens": 684467129.0, + "step": 17936 + }, + { + "epoch": 2.2817707670779797, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0576016902923584, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8575378656387329, + "num_tokens": 684500093.0, + "step": 17937 + }, + { + "epoch": 2.2818979773565706, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8958520889282227, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8568963408470154, + "num_tokens": 684541422.0, + "step": 17938 + }, + { + "epoch": 2.2820251876351607, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8482531309127808, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8618953227996826, + "num_tokens": 684579164.0, + "step": 17939 + }, + { + "epoch": 2.2821523979137512, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0104875564575195, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8491001725196838, + "num_tokens": 684618617.0, + "step": 17940 + }, + { + "epoch": 2.2822796081923418, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8727850914001465, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8507626056671143, + "num_tokens": 684655525.0, + "step": 17941 + }, + { + "epoch": 2.2824068184709323, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2998218536376953, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8651109933853149, + "num_tokens": 684694525.0, + "step": 17942 + }, + { + "epoch": 2.282534028749523, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8156341314315796, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8659616112709045, + "num_tokens": 684733247.0, + "step": 17943 + }, + { + "epoch": 2.2826612390281134, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.908613681793213, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.877225399017334, + "num_tokens": 684763642.0, + "step": 17944 + }, + { + "epoch": 2.282788449306704, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.858336329460144, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8712568283081055, + "num_tokens": 684806164.0, + "step": 17945 + }, + { + "epoch": 2.2829156595852944, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8467459678649902, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8651422262191772, + "num_tokens": 684844193.0, + "step": 17946 + }, + { + "epoch": 2.283042869863885, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.5048482418060303, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.871200680732727, + "num_tokens": 684882103.0, + "step": 17947 + }, + { + "epoch": 2.2831700801424755, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9763283729553223, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.879698634147644, + "num_tokens": 684917667.0, + "step": 17948 + }, + { + "epoch": 2.283297290421066, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 80.52635955810547, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8699169754981995, + "num_tokens": 684954007.0, + "step": 17949 + }, + { + "epoch": 2.2834245006996565, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0418572425842285, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8605688214302063, + "num_tokens": 684994693.0, + "step": 17950 + }, + { + "epoch": 2.283551710978247, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9850503206253052, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8537166118621826, + "num_tokens": 685032713.0, + "step": 17951 + }, + { + "epoch": 2.2836789212568376, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0696170330047607, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8592395186424255, + "num_tokens": 685066597.0, + "step": 17952 + }, + { + "epoch": 2.283806131535428, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8541043996810913, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.859209418296814, + "num_tokens": 685106243.0, + "step": 17953 + }, + { + "epoch": 2.2839333418140186, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2132017612457275, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8562802076339722, + "num_tokens": 685145177.0, + "step": 17954 + }, + { + "epoch": 2.284060552092609, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8605672121047974, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8729990124702454, + "num_tokens": 685189130.0, + "step": 17955 + }, + { + "epoch": 2.2841877623711997, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.891555905342102, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8687649965286255, + "num_tokens": 685227536.0, + "step": 17956 + }, + { + "epoch": 2.28431497264979, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3389651775360107, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8705686926841736, + "num_tokens": 685267213.0, + "step": 17957 + }, + { + "epoch": 2.2844421829283807, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9648833274841309, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8631573915481567, + "num_tokens": 685301115.0, + "step": 17958 + }, + { + "epoch": 2.2845693932069713, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8282984495162964, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8713699579238892, + "num_tokens": 685338497.0, + "step": 17959 + }, + { + "epoch": 2.284696603485562, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9045906066894531, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8548978567123413, + "num_tokens": 685374983.0, + "step": 17960 + }, + { + "epoch": 2.2848238137641523, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8785711526870728, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8717683553695679, + "num_tokens": 685408859.0, + "step": 17961 + }, + { + "epoch": 2.2849510240427424, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7963082790374756, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8626893162727356, + "num_tokens": 685445103.0, + "step": 17962 + }, + { + "epoch": 2.2850782343213334, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7528855800628662, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8596241474151611, + "num_tokens": 685484657.0, + "step": 17963 + }, + { + "epoch": 2.2852054445999235, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8264431953430176, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8734272122383118, + "num_tokens": 685524034.0, + "step": 17964 + }, + { + "epoch": 2.285332654878514, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8314650058746338, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8717849254608154, + "num_tokens": 685560368.0, + "step": 17965 + }, + { + "epoch": 2.2854598651571045, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9511384963989258, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.861091136932373, + "num_tokens": 685598192.0, + "step": 17966 + }, + { + "epoch": 2.285587075435695, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.387096881866455, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8683521747589111, + "num_tokens": 685636300.0, + "step": 17967 + }, + { + "epoch": 2.2857142857142856, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.966745138168335, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8635679483413696, + "num_tokens": 685671457.0, + "step": 17968 + }, + { + "epoch": 2.285841495992876, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9521615505218506, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8537496328353882, + "num_tokens": 685709687.0, + "step": 17969 + }, + { + "epoch": 2.2859687062714666, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8839586973190308, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8437892198562622, + "num_tokens": 685747854.0, + "step": 17970 + }, + { + "epoch": 2.286095916550057, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7876955270767212, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8848564624786377, + "num_tokens": 685785215.0, + "step": 17971 + }, + { + "epoch": 2.2862231268286477, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8627450466156006, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8698931932449341, + "num_tokens": 685818871.0, + "step": 17972 + }, + { + "epoch": 2.2863503371072382, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9746330976486206, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8589025735855103, + "num_tokens": 685859089.0, + "step": 17973 + }, + { + "epoch": 2.2864775473858288, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7935640811920166, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.857326328754425, + "num_tokens": 685900346.0, + "step": 17974 + }, + { + "epoch": 2.2866047576644193, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8971279859542847, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8640574216842651, + "num_tokens": 685938186.0, + "step": 17975 + }, + { + "epoch": 2.28673196794301, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9601248502731323, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8718646764755249, + "num_tokens": 685974911.0, + "step": 17976 + }, + { + "epoch": 2.2868591782216003, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0040316581726074, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8673888444900513, + "num_tokens": 686010430.0, + "step": 17977 + }, + { + "epoch": 2.286986388500191, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.181652784347534, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8758945465087891, + "num_tokens": 686042727.0, + "step": 17978 + }, + { + "epoch": 2.2871135987787814, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0936617851257324, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8669832944869995, + "num_tokens": 686078819.0, + "step": 17979 + }, + { + "epoch": 2.287240809057372, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.002969264984131, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8664231300354004, + "num_tokens": 686117760.0, + "step": 17980 + }, + { + "epoch": 2.2873680193359625, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9683815240859985, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.857000470161438, + "num_tokens": 686152992.0, + "step": 17981 + }, + { + "epoch": 2.287495229614553, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8713023662567139, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8605649471282959, + "num_tokens": 686195353.0, + "step": 17982 + }, + { + "epoch": 2.2876224398931435, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9694297313690186, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8618840575218201, + "num_tokens": 686232990.0, + "step": 17983 + }, + { + "epoch": 2.287749650171734, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.979099154472351, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8703920841217041, + "num_tokens": 686265933.0, + "step": 17984 + }, + { + "epoch": 2.2878768604503246, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8996667861938477, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8714994788169861, + "num_tokens": 686307111.0, + "step": 17985 + }, + { + "epoch": 2.288004070728915, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9828847646713257, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8673317432403564, + "num_tokens": 686343363.0, + "step": 17986 + }, + { + "epoch": 2.288131281007505, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9380677938461304, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8642134666442871, + "num_tokens": 686380888.0, + "step": 17987 + }, + { + "epoch": 2.288258491286096, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9043840169906616, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8573379516601562, + "num_tokens": 686417350.0, + "step": 17988 + }, + { + "epoch": 2.2883857015646862, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2769126892089844, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.873633623123169, + "num_tokens": 686458658.0, + "step": 17989 + }, + { + "epoch": 2.2885129118432768, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.3564765453338623, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8680965900421143, + "num_tokens": 686493945.0, + "step": 17990 + }, + { + "epoch": 2.2886401221218673, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9152193069458008, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8642070293426514, + "num_tokens": 686527608.0, + "step": 17991 + }, + { + "epoch": 2.288767332400458, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9182054996490479, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8770326972007751, + "num_tokens": 686562272.0, + "step": 17992 + }, + { + "epoch": 2.2888945426790483, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9191009998321533, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8763227462768555, + "num_tokens": 686599937.0, + "step": 17993 + }, + { + "epoch": 2.289021752957639, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0926008224487305, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8662370443344116, + "num_tokens": 686632103.0, + "step": 17994 + }, + { + "epoch": 2.2891489632362294, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.7918076515197754, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8730812668800354, + "num_tokens": 686662802.0, + "step": 17995 + }, + { + "epoch": 2.28927617351482, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.002389430999756, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8871701955795288, + "num_tokens": 686699386.0, + "step": 17996 + }, + { + "epoch": 2.2894033837934105, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.142788887023926, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.868857741355896, + "num_tokens": 686738369.0, + "step": 17997 + }, + { + "epoch": 2.289530594072001, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.987284779548645, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8587738275527954, + "num_tokens": 686774543.0, + "step": 17998 + }, + { + "epoch": 2.2896578043505915, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.026312828063965, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8570437431335449, + "num_tokens": 686807048.0, + "step": 17999 + }, + { + "epoch": 2.289785014629182, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8978296518325806, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8664107918739319, + "num_tokens": 686844678.0, + "step": 18000 + }, + { + "epoch": 2.2899122249077726, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9429582357406616, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8657931089401245, + "num_tokens": 686877721.0, + "step": 18001 + }, + { + "epoch": 2.290039435186363, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0588839054107666, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.856330394744873, + "num_tokens": 686918329.0, + "step": 18002 + }, + { + "epoch": 2.2901666454649536, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9242438077926636, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8604727983474731, + "num_tokens": 686960050.0, + "step": 18003 + }, + { + "epoch": 2.290293855743544, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.802024006843567, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8752144575119019, + "num_tokens": 686997849.0, + "step": 18004 + }, + { + "epoch": 2.2904210660221347, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9095532894134521, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8696226477622986, + "num_tokens": 687032363.0, + "step": 18005 + }, + { + "epoch": 2.290548276300725, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.959283471107483, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8498964309692383, + "num_tokens": 687069606.0, + "step": 18006 + }, + { + "epoch": 2.2906754865793157, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9066522121429443, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8558125495910645, + "num_tokens": 687112857.0, + "step": 18007 + }, + { + "epoch": 2.2908026968579063, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.85270357131958, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8631649017333984, + "num_tokens": 687152245.0, + "step": 18008 + }, + { + "epoch": 2.290929907136497, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7966278791427612, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8646528720855713, + "num_tokens": 687193988.0, + "step": 18009 + }, + { + "epoch": 2.2910571174150873, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8683496713638306, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8601288795471191, + "num_tokens": 687232375.0, + "step": 18010 + }, + { + "epoch": 2.291184327693678, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8472980260849, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8696975708007812, + "num_tokens": 687274566.0, + "step": 18011 + }, + { + "epoch": 2.291311537972268, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.392385482788086, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8643988370895386, + "num_tokens": 687313231.0, + "step": 18012 + }, + { + "epoch": 2.291438748250859, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8906748294830322, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8708393573760986, + "num_tokens": 687354502.0, + "step": 18013 + }, + { + "epoch": 2.291565958529449, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8126327991485596, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8622561693191528, + "num_tokens": 687399160.0, + "step": 18014 + }, + { + "epoch": 2.2916931688080395, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7864910364151, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8554050922393799, + "num_tokens": 687438775.0, + "step": 18015 + }, + { + "epoch": 2.29182037908663, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.016070604324341, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8709635138511658, + "num_tokens": 687471435.0, + "step": 18016 + }, + { + "epoch": 2.2919475893652206, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8166686296463013, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8665505647659302, + "num_tokens": 687508657.0, + "step": 18017 + }, + { + "epoch": 2.292074799643811, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.031355619430542, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8755151033401489, + "num_tokens": 687545024.0, + "step": 18018 + }, + { + "epoch": 2.2922020099224016, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0214450359344482, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8691978454589844, + "num_tokens": 687579432.0, + "step": 18019 + }, + { + "epoch": 2.292329220200992, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9852124452590942, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8557338714599609, + "num_tokens": 687618378.0, + "step": 18020 + }, + { + "epoch": 2.2924564304795827, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 4.825920104980469, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8630239963531494, + "num_tokens": 687657119.0, + "step": 18021 + }, + { + "epoch": 2.2925836407581732, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9971332550048828, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8549201488494873, + "num_tokens": 687691924.0, + "step": 18022 + }, + { + "epoch": 2.2927108510367638, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.16646409034729, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8424220681190491, + "num_tokens": 687724221.0, + "step": 18023 + }, + { + "epoch": 2.2928380613153543, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 16.60664176940918, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8663558959960938, + "num_tokens": 687765049.0, + "step": 18024 + }, + { + "epoch": 2.292965271593945, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9977625608444214, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8548983931541443, + "num_tokens": 687810222.0, + "step": 18025 + }, + { + "epoch": 2.2930924818725353, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9932256937026978, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8587071299552917, + "num_tokens": 687851961.0, + "step": 18026 + }, + { + "epoch": 2.293219692151126, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9280327558517456, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8549166917800903, + "num_tokens": 687890448.0, + "step": 18027 + }, + { + "epoch": 2.2933469024297164, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.775344729423523, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8739818334579468, + "num_tokens": 687929509.0, + "step": 18028 + }, + { + "epoch": 2.293474112708307, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9425166845321655, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8553510308265686, + "num_tokens": 687966814.0, + "step": 18029 + }, + { + "epoch": 2.2936013229868975, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.83426833152771, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8723760843276978, + "num_tokens": 688006398.0, + "step": 18030 + }, + { + "epoch": 2.293728533265488, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9153226613998413, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8614459037780762, + "num_tokens": 688044604.0, + "step": 18031 + }, + { + "epoch": 2.2938557435440785, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8366057872772217, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8608649969100952, + "num_tokens": 688082969.0, + "step": 18032 + }, + { + "epoch": 2.293982953822669, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8456575870513916, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8726851940155029, + "num_tokens": 688121442.0, + "step": 18033 + }, + { + "epoch": 2.2941101641012596, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7682806253433228, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8656774759292603, + "num_tokens": 688161096.0, + "step": 18034 + }, + { + "epoch": 2.2942373743798496, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8765997886657715, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8666239976882935, + "num_tokens": 688200782.0, + "step": 18035 + }, + { + "epoch": 2.2943645846584406, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8120964765548706, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8722058534622192, + "num_tokens": 688241333.0, + "step": 18036 + }, + { + "epoch": 2.2944917949370307, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9006717205047607, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8632093667984009, + "num_tokens": 688278297.0, + "step": 18037 + }, + { + "epoch": 2.2946190052156212, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8823707103729248, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8410061597824097, + "num_tokens": 688318835.0, + "step": 18038 + }, + { + "epoch": 2.2947462154942118, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8474478721618652, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8677983283996582, + "num_tokens": 688354814.0, + "step": 18039 + }, + { + "epoch": 2.2948734257728023, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0998125076293945, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8684068918228149, + "num_tokens": 688390008.0, + "step": 18040 + }, + { + "epoch": 2.295000636051393, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.965819001197815, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8667830228805542, + "num_tokens": 688430938.0, + "step": 18041 + }, + { + "epoch": 2.2951278463299833, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.13643479347229, + "learning_rate": 1e-06, + "loss": 0.5522, + "mean_token_accuracy": 0.8350826501846313, + "num_tokens": 688466730.0, + "step": 18042 + }, + { + "epoch": 2.295255056608574, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8373397588729858, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8645191192626953, + "num_tokens": 688506666.0, + "step": 18043 + }, + { + "epoch": 2.2953822668871644, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0071542263031006, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8498831987380981, + "num_tokens": 688547726.0, + "step": 18044 + }, + { + "epoch": 2.295509477165755, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0367443561553955, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8618301153182983, + "num_tokens": 688587487.0, + "step": 18045 + }, + { + "epoch": 2.2956366874443455, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8529636859893799, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8583798408508301, + "num_tokens": 688632454.0, + "step": 18046 + }, + { + "epoch": 2.295763897722936, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8789780139923096, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8722971677780151, + "num_tokens": 688668489.0, + "step": 18047 + }, + { + "epoch": 2.2958911080015265, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8692795038223267, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8681128025054932, + "num_tokens": 688707899.0, + "step": 18048 + }, + { + "epoch": 2.296018318280117, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7818735837936401, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8702686429023743, + "num_tokens": 688747171.0, + "step": 18049 + }, + { + "epoch": 2.2961455285587076, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9861599206924438, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8631339073181152, + "num_tokens": 688789553.0, + "step": 18050 + }, + { + "epoch": 2.296272738837298, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.912350058555603, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8678896427154541, + "num_tokens": 688827632.0, + "step": 18051 + }, + { + "epoch": 2.2963999491158886, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.024571657180786, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8742793798446655, + "num_tokens": 688861505.0, + "step": 18052 + }, + { + "epoch": 2.296527159394479, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9334464073181152, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8576558828353882, + "num_tokens": 688898270.0, + "step": 18053 + }, + { + "epoch": 2.2966543696730697, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8320021629333496, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8661453723907471, + "num_tokens": 688937420.0, + "step": 18054 + }, + { + "epoch": 2.29678157995166, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1343839168548584, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8702543377876282, + "num_tokens": 688972426.0, + "step": 18055 + }, + { + "epoch": 2.2969087902302507, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1358439922332764, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8644247651100159, + "num_tokens": 689005801.0, + "step": 18056 + }, + { + "epoch": 2.2970360005088413, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9432474374771118, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8599657416343689, + "num_tokens": 689039999.0, + "step": 18057 + }, + { + "epoch": 2.297163210787432, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9861855506896973, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8494836091995239, + "num_tokens": 689079404.0, + "step": 18058 + }, + { + "epoch": 2.2972904210660223, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9267261028289795, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8420277833938599, + "num_tokens": 689116197.0, + "step": 18059 + }, + { + "epoch": 2.2974176313446124, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1709821224212646, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8653343319892883, + "num_tokens": 689153446.0, + "step": 18060 + }, + { + "epoch": 2.2975448416232034, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.034255027770996, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.847568154335022, + "num_tokens": 689185310.0, + "step": 18061 + }, + { + "epoch": 2.2976720519017935, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8601665496826172, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8748918175697327, + "num_tokens": 689224150.0, + "step": 18062 + }, + { + "epoch": 2.297799262180384, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9556916952133179, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8561815619468689, + "num_tokens": 689261206.0, + "step": 18063 + }, + { + "epoch": 2.2979264724589745, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9276334047317505, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8702304363250732, + "num_tokens": 689299129.0, + "step": 18064 + }, + { + "epoch": 2.298053682737565, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0974509716033936, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8814029693603516, + "num_tokens": 689330554.0, + "step": 18065 + }, + { + "epoch": 2.2981808930161556, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9465296268463135, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8710493445396423, + "num_tokens": 689364084.0, + "step": 18066 + }, + { + "epoch": 2.298308103294746, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7703241109848022, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8544843196868896, + "num_tokens": 689405254.0, + "step": 18067 + }, + { + "epoch": 2.2984353135733366, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9494082927703857, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8563002943992615, + "num_tokens": 689441109.0, + "step": 18068 + }, + { + "epoch": 2.298562523851927, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9882376194000244, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8625771999359131, + "num_tokens": 689474937.0, + "step": 18069 + }, + { + "epoch": 2.2986897341305177, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7970117330551147, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8597134351730347, + "num_tokens": 689515971.0, + "step": 18070 + }, + { + "epoch": 2.298816944409108, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1098756790161133, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8419229388237, + "num_tokens": 689553620.0, + "step": 18071 + }, + { + "epoch": 2.2989441546876987, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9079995155334473, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8690413236618042, + "num_tokens": 689592157.0, + "step": 18072 + }, + { + "epoch": 2.2990713649662893, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0114076137542725, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8778988122940063, + "num_tokens": 689626889.0, + "step": 18073 + }, + { + "epoch": 2.29919857524488, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.835809588432312, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8763729333877563, + "num_tokens": 689666161.0, + "step": 18074 + }, + { + "epoch": 2.2993257855234703, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.041142225265503, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8563631176948547, + "num_tokens": 689713713.0, + "step": 18075 + }, + { + "epoch": 2.299452995802061, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9292763471603394, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.857427179813385, + "num_tokens": 689755146.0, + "step": 18076 + }, + { + "epoch": 2.2995802060806514, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8224354982376099, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8753542900085449, + "num_tokens": 689795925.0, + "step": 18077 + }, + { + "epoch": 2.299707416359242, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9790958166122437, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8757472038269043, + "num_tokens": 689830673.0, + "step": 18078 + }, + { + "epoch": 2.2998346266378324, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0309231281280518, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8638118505477905, + "num_tokens": 689863649.0, + "step": 18079 + }, + { + "epoch": 2.299961836916423, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.950075387954712, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8640509843826294, + "num_tokens": 689901946.0, + "step": 18080 + }, + { + "epoch": 2.3000890471950135, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.019984006881714, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8587212562561035, + "num_tokens": 689939968.0, + "step": 18081 + }, + { + "epoch": 2.300216257473604, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7529780864715576, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8733712434768677, + "num_tokens": 689979792.0, + "step": 18082 + }, + { + "epoch": 2.3003434677521946, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.840344786643982, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.877451479434967, + "num_tokens": 690013974.0, + "step": 18083 + }, + { + "epoch": 2.300470678030785, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6887155771255493, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8725359439849854, + "num_tokens": 690056540.0, + "step": 18084 + }, + { + "epoch": 2.300597888309375, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7884953022003174, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8582321405410767, + "num_tokens": 690101032.0, + "step": 18085 + }, + { + "epoch": 2.300725098587966, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8030750751495361, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8775110244750977, + "num_tokens": 690135417.0, + "step": 18086 + }, + { + "epoch": 2.3008523088665562, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0735790729522705, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8668303489685059, + "num_tokens": 690178552.0, + "step": 18087 + }, + { + "epoch": 2.3009795191451468, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9017555713653564, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.862787663936615, + "num_tokens": 690221928.0, + "step": 18088 + }, + { + "epoch": 2.3011067294237373, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9699997901916504, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8722271919250488, + "num_tokens": 690257040.0, + "step": 18089 + }, + { + "epoch": 2.301233939702328, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7512445449829102, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.865103006362915, + "num_tokens": 690302720.0, + "step": 18090 + }, + { + "epoch": 2.3013611499809183, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.064978837966919, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8583117723464966, + "num_tokens": 690337042.0, + "step": 18091 + }, + { + "epoch": 2.301488360259509, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.974288821220398, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8713183403015137, + "num_tokens": 690380247.0, + "step": 18092 + }, + { + "epoch": 2.3016155705380994, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7741838693618774, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8679596185684204, + "num_tokens": 690425361.0, + "step": 18093 + }, + { + "epoch": 2.30174278081669, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9304282665252686, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.860399603843689, + "num_tokens": 690465137.0, + "step": 18094 + }, + { + "epoch": 2.3018699910952805, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0201711654663086, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8635757565498352, + "num_tokens": 690501437.0, + "step": 18095 + }, + { + "epoch": 2.301997201373871, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9543813467025757, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8601839542388916, + "num_tokens": 690544134.0, + "step": 18096 + }, + { + "epoch": 2.3021244116524615, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1702513694763184, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8737165927886963, + "num_tokens": 690581062.0, + "step": 18097 + }, + { + "epoch": 2.302251621931052, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9330155849456787, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8701934814453125, + "num_tokens": 690615145.0, + "step": 18098 + }, + { + "epoch": 2.3023788322096426, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8568145036697388, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8709454536437988, + "num_tokens": 690655483.0, + "step": 18099 + }, + { + "epoch": 2.302506042488233, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9021179676055908, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8638036251068115, + "num_tokens": 690694133.0, + "step": 18100 + }, + { + "epoch": 2.3026332527668236, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0104024410247803, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8438562154769897, + "num_tokens": 690729939.0, + "step": 18101 + }, + { + "epoch": 2.302760463045414, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9438791275024414, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.871096134185791, + "num_tokens": 690770139.0, + "step": 18102 + }, + { + "epoch": 2.3028876733240047, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7968162298202515, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8670792579650879, + "num_tokens": 690811379.0, + "step": 18103 + }, + { + "epoch": 2.303014883602595, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9851319789886475, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.861678957939148, + "num_tokens": 690844194.0, + "step": 18104 + }, + { + "epoch": 2.3031420938811857, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8498787879943848, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8592976331710815, + "num_tokens": 690882129.0, + "step": 18105 + }, + { + "epoch": 2.3032693041597763, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8889061212539673, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8781822919845581, + "num_tokens": 690915568.0, + "step": 18106 + }, + { + "epoch": 2.303396514438367, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.950289011001587, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8617401123046875, + "num_tokens": 690953412.0, + "step": 18107 + }, + { + "epoch": 2.3035237247169573, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0022196769714355, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8625226020812988, + "num_tokens": 690990047.0, + "step": 18108 + }, + { + "epoch": 2.303650934995548, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.055081367492676, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8706859350204468, + "num_tokens": 691031010.0, + "step": 18109 + }, + { + "epoch": 2.303778145274138, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.007566213607788, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8666766285896301, + "num_tokens": 691065830.0, + "step": 18110 + }, + { + "epoch": 2.303905355552729, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.900726556777954, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.877284049987793, + "num_tokens": 691101418.0, + "step": 18111 + }, + { + "epoch": 2.304032565831319, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0653581619262695, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8497108817100525, + "num_tokens": 691139447.0, + "step": 18112 + }, + { + "epoch": 2.3041597761099095, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2665855884552, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8449796438217163, + "num_tokens": 691168809.0, + "step": 18113 + }, + { + "epoch": 2.3042869863885, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0224931240081787, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8701702952384949, + "num_tokens": 691200420.0, + "step": 18114 + }, + { + "epoch": 2.3044141966670906, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9020709991455078, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8591480255126953, + "num_tokens": 691238587.0, + "step": 18115 + }, + { + "epoch": 2.304541406945681, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8956530094146729, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8640044927597046, + "num_tokens": 691277137.0, + "step": 18116 + }, + { + "epoch": 2.3046686172242716, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9208472967147827, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8622795343399048, + "num_tokens": 691319564.0, + "step": 18117 + }, + { + "epoch": 2.304795827502862, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9029215574264526, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8631176948547363, + "num_tokens": 691359652.0, + "step": 18118 + }, + { + "epoch": 2.3049230377814527, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8459659814834595, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8650041818618774, + "num_tokens": 691399705.0, + "step": 18119 + }, + { + "epoch": 2.305050248060043, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8926242589950562, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8575607538223267, + "num_tokens": 691439447.0, + "step": 18120 + }, + { + "epoch": 2.3051774583386337, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9719157218933105, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8536747097969055, + "num_tokens": 691477453.0, + "step": 18121 + }, + { + "epoch": 2.3053046686172243, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8565049171447754, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8651057481765747, + "num_tokens": 691517766.0, + "step": 18122 + }, + { + "epoch": 2.305431878895815, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.848541259765625, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8638225793838501, + "num_tokens": 691553956.0, + "step": 18123 + }, + { + "epoch": 2.3055590891744053, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8747572898864746, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8629748225212097, + "num_tokens": 691592522.0, + "step": 18124 + }, + { + "epoch": 2.305686299452996, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.5581228733062744, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8650411367416382, + "num_tokens": 691629202.0, + "step": 18125 + }, + { + "epoch": 2.3058135097315864, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9375122785568237, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8676161170005798, + "num_tokens": 691665324.0, + "step": 18126 + }, + { + "epoch": 2.305940720010177, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8151671886444092, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8742626309394836, + "num_tokens": 691704223.0, + "step": 18127 + }, + { + "epoch": 2.3060679302887674, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8240958452224731, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8624967932701111, + "num_tokens": 691745061.0, + "step": 18128 + }, + { + "epoch": 2.306195140567358, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8325080871582031, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8761518597602844, + "num_tokens": 691783630.0, + "step": 18129 + }, + { + "epoch": 2.3063223508459485, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8567301034927368, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8637663125991821, + "num_tokens": 691818283.0, + "step": 18130 + }, + { + "epoch": 2.306449561124539, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9977307319641113, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8535779714584351, + "num_tokens": 691854585.0, + "step": 18131 + }, + { + "epoch": 2.3065767714031296, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9790126085281372, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.847103476524353, + "num_tokens": 691891784.0, + "step": 18132 + }, + { + "epoch": 2.3067039816817196, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.863256573677063, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8660764694213867, + "num_tokens": 691927389.0, + "step": 18133 + }, + { + "epoch": 2.3068311919603106, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9121594429016113, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8629533052444458, + "num_tokens": 691962928.0, + "step": 18134 + }, + { + "epoch": 2.3069584022389007, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.784776210784912, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8638147115707397, + "num_tokens": 692007064.0, + "step": 18135 + }, + { + "epoch": 2.3070856125174912, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8813542127609253, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.869531512260437, + "num_tokens": 692040668.0, + "step": 18136 + }, + { + "epoch": 2.3072128227960818, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.956346035003662, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8656260967254639, + "num_tokens": 692077710.0, + "step": 18137 + }, + { + "epoch": 2.3073400330746723, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8797039985656738, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8726496696472168, + "num_tokens": 692112316.0, + "step": 18138 + }, + { + "epoch": 2.307467243353263, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9875544309616089, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.869996190071106, + "num_tokens": 692153201.0, + "step": 18139 + }, + { + "epoch": 2.3075944536318533, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9479382038116455, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8577315807342529, + "num_tokens": 692192942.0, + "step": 18140 + }, + { + "epoch": 2.307721663910444, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8905751705169678, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8793927431106567, + "num_tokens": 692235515.0, + "step": 18141 + }, + { + "epoch": 2.3078488741890344, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.109647512435913, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8549109101295471, + "num_tokens": 692271338.0, + "step": 18142 + }, + { + "epoch": 2.307976084467625, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9445935487747192, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8454777002334595, + "num_tokens": 692311469.0, + "step": 18143 + }, + { + "epoch": 2.3081032947462155, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8878941535949707, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8455685377120972, + "num_tokens": 692352052.0, + "step": 18144 + }, + { + "epoch": 2.308230505024806, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.008071184158325, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8486731052398682, + "num_tokens": 692382978.0, + "step": 18145 + }, + { + "epoch": 2.3083577153033965, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9764825105667114, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8643940687179565, + "num_tokens": 692415651.0, + "step": 18146 + }, + { + "epoch": 2.308484925581987, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.902301788330078, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8467868566513062, + "num_tokens": 692455937.0, + "step": 18147 + }, + { + "epoch": 2.3086121358605776, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.5928852558135986, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.859926700592041, + "num_tokens": 692496277.0, + "step": 18148 + }, + { + "epoch": 2.308739346139168, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0866358280181885, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8645570278167725, + "num_tokens": 692530485.0, + "step": 18149 + }, + { + "epoch": 2.3088665564177586, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.108985185623169, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8613934516906738, + "num_tokens": 692564794.0, + "step": 18150 + }, + { + "epoch": 2.308993766696349, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7990347146987915, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8543139696121216, + "num_tokens": 692610359.0, + "step": 18151 + }, + { + "epoch": 2.3091209769749397, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8550302982330322, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8724530339241028, + "num_tokens": 692641560.0, + "step": 18152 + }, + { + "epoch": 2.30924818725353, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9401214122772217, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8606083989143372, + "num_tokens": 692680114.0, + "step": 18153 + }, + { + "epoch": 2.3093753975321207, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0758116245269775, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8515859246253967, + "num_tokens": 692714199.0, + "step": 18154 + }, + { + "epoch": 2.3095026078107113, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8325773477554321, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8674763441085815, + "num_tokens": 692752331.0, + "step": 18155 + }, + { + "epoch": 2.309629818089302, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8707129955291748, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.870428204536438, + "num_tokens": 692789167.0, + "step": 18156 + }, + { + "epoch": 2.3097570283678923, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7543036937713623, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8635154962539673, + "num_tokens": 692829234.0, + "step": 18157 + }, + { + "epoch": 2.3098842386464824, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.011514663696289, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8426946997642517, + "num_tokens": 692863749.0, + "step": 18158 + }, + { + "epoch": 2.3100114489250734, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.941067099571228, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8614783883094788, + "num_tokens": 692900972.0, + "step": 18159 + }, + { + "epoch": 2.3101386592036635, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.895782470703125, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8579563498497009, + "num_tokens": 692938522.0, + "step": 18160 + }, + { + "epoch": 2.310265869482254, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2947347164154053, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8727395534515381, + "num_tokens": 692973653.0, + "step": 18161 + }, + { + "epoch": 2.3103930797608445, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8342039585113525, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8570153713226318, + "num_tokens": 693016325.0, + "step": 18162 + }, + { + "epoch": 2.310520290039435, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8664112091064453, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8752788305282593, + "num_tokens": 693052022.0, + "step": 18163 + }, + { + "epoch": 2.3106475003180256, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.846880555152893, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8682364225387573, + "num_tokens": 693093936.0, + "step": 18164 + }, + { + "epoch": 2.310774710596616, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9844813346862793, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8782721161842346, + "num_tokens": 693130160.0, + "step": 18165 + }, + { + "epoch": 2.3109019208752066, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9137526750564575, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8662669062614441, + "num_tokens": 693172405.0, + "step": 18166 + }, + { + "epoch": 2.311029131153797, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0378577709198, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8667296171188354, + "num_tokens": 693206499.0, + "step": 18167 + }, + { + "epoch": 2.3111563414323877, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0478289127349854, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8637371063232422, + "num_tokens": 693244287.0, + "step": 18168 + }, + { + "epoch": 2.311283551710978, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8725553750991821, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8558971285820007, + "num_tokens": 693286164.0, + "step": 18169 + }, + { + "epoch": 2.3114107619895687, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7921401262283325, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8713856339454651, + "num_tokens": 693330961.0, + "step": 18170 + }, + { + "epoch": 2.3115379722681593, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7368171215057373, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8694450855255127, + "num_tokens": 693375169.0, + "step": 18171 + }, + { + "epoch": 2.31166518254675, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9355531930923462, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8584058284759521, + "num_tokens": 693414127.0, + "step": 18172 + }, + { + "epoch": 2.3117923928253403, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9954524040222168, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8609846830368042, + "num_tokens": 693450446.0, + "step": 18173 + }, + { + "epoch": 2.311919603103931, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 16.615386962890625, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8551573753356934, + "num_tokens": 693490973.0, + "step": 18174 + }, + { + "epoch": 2.3120468133825214, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.149834156036377, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8668081760406494, + "num_tokens": 693526815.0, + "step": 18175 + }, + { + "epoch": 2.312174023661112, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.962180733680725, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8576211929321289, + "num_tokens": 693566949.0, + "step": 18176 + }, + { + "epoch": 2.3123012339397024, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.4409782886505127, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.856669008731842, + "num_tokens": 693601116.0, + "step": 18177 + }, + { + "epoch": 2.312428444218293, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8791927099227905, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8937320113182068, + "num_tokens": 693640588.0, + "step": 18178 + }, + { + "epoch": 2.3125556544968835, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0190958976745605, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8645044565200806, + "num_tokens": 693683352.0, + "step": 18179 + }, + { + "epoch": 2.312682864775474, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7995800971984863, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8691359758377075, + "num_tokens": 693722579.0, + "step": 18180 + }, + { + "epoch": 2.3128100750540646, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.876715064048767, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8651574850082397, + "num_tokens": 693766743.0, + "step": 18181 + }, + { + "epoch": 2.312937285332655, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7107213735580444, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8668516874313354, + "num_tokens": 693809699.0, + "step": 18182 + }, + { + "epoch": 2.313064495611245, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9521675109863281, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.868942141532898, + "num_tokens": 693845508.0, + "step": 18183 + }, + { + "epoch": 2.313191705889836, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.075350284576416, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8641893863677979, + "num_tokens": 693883763.0, + "step": 18184 + }, + { + "epoch": 2.313318916168426, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.005152940750122, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8700826168060303, + "num_tokens": 693917093.0, + "step": 18185 + }, + { + "epoch": 2.3134461264470167, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.4158883094787598, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8648489713668823, + "num_tokens": 693949219.0, + "step": 18186 + }, + { + "epoch": 2.3135733367256073, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8032501935958862, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.857505202293396, + "num_tokens": 693992755.0, + "step": 18187 + }, + { + "epoch": 2.313700547004198, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0683462619781494, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8507426381111145, + "num_tokens": 694030386.0, + "step": 18188 + }, + { + "epoch": 2.3138277572827883, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.878785490989685, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.872456431388855, + "num_tokens": 694065019.0, + "step": 18189 + }, + { + "epoch": 2.313954967561379, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.810708999633789, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8614590167999268, + "num_tokens": 694106645.0, + "step": 18190 + }, + { + "epoch": 2.3140821778399694, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.776570439338684, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8686836361885071, + "num_tokens": 694149146.0, + "step": 18191 + }, + { + "epoch": 2.31420938811856, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8070272207260132, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8484998941421509, + "num_tokens": 694193567.0, + "step": 18192 + }, + { + "epoch": 2.3143365983971504, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.883381724357605, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8759411573410034, + "num_tokens": 694232160.0, + "step": 18193 + }, + { + "epoch": 2.314463808675741, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7335350513458252, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8657522201538086, + "num_tokens": 694275153.0, + "step": 18194 + }, + { + "epoch": 2.3145910189543315, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.772756814956665, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8707184791564941, + "num_tokens": 694313722.0, + "step": 18195 + }, + { + "epoch": 2.314718229232922, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8667680025100708, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.868437647819519, + "num_tokens": 694350544.0, + "step": 18196 + }, + { + "epoch": 2.3148454395115126, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9413796663284302, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8655409812927246, + "num_tokens": 694386552.0, + "step": 18197 + }, + { + "epoch": 2.314972649790103, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8291261196136475, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8601795434951782, + "num_tokens": 694434368.0, + "step": 18198 + }, + { + "epoch": 2.3150998600686936, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9308072328567505, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8637745380401611, + "num_tokens": 694479856.0, + "step": 18199 + }, + { + "epoch": 2.315227070347284, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8599720001220703, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8639456033706665, + "num_tokens": 694521107.0, + "step": 18200 + }, + { + "epoch": 2.3153542806258747, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0700433254241943, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8648169040679932, + "num_tokens": 694560886.0, + "step": 18201 + }, + { + "epoch": 2.315481490904465, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.878909707069397, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8684327602386475, + "num_tokens": 694598404.0, + "step": 18202 + }, + { + "epoch": 2.3156087011830557, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8602992296218872, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8591165542602539, + "num_tokens": 694637162.0, + "step": 18203 + }, + { + "epoch": 2.3157359114616463, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.983023762702942, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8676708340644836, + "num_tokens": 694677435.0, + "step": 18204 + }, + { + "epoch": 2.315863121740237, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8621630668640137, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8736406564712524, + "num_tokens": 694713980.0, + "step": 18205 + }, + { + "epoch": 2.3159903320188273, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9215091466903687, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8607043623924255, + "num_tokens": 694753884.0, + "step": 18206 + }, + { + "epoch": 2.316117542297418, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9186761379241943, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8780562877655029, + "num_tokens": 694785739.0, + "step": 18207 + }, + { + "epoch": 2.316244752576008, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9559201002120972, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8640190362930298, + "num_tokens": 694827629.0, + "step": 18208 + }, + { + "epoch": 2.316371962854599, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.855713963508606, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.860924243927002, + "num_tokens": 694868241.0, + "step": 18209 + }, + { + "epoch": 2.316499173133189, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9696933031082153, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8542319536209106, + "num_tokens": 694905298.0, + "step": 18210 + }, + { + "epoch": 2.3166263834117795, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0830681324005127, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8570749759674072, + "num_tokens": 694938054.0, + "step": 18211 + }, + { + "epoch": 2.31675359369037, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9086244106292725, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8597416877746582, + "num_tokens": 694978761.0, + "step": 18212 + }, + { + "epoch": 2.3168808039689606, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9540486335754395, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8602855801582336, + "num_tokens": 695015073.0, + "step": 18213 + }, + { + "epoch": 2.317008014247551, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0806355476379395, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8586028218269348, + "num_tokens": 695052288.0, + "step": 18214 + }, + { + "epoch": 2.3171352245261416, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9656566381454468, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8536814451217651, + "num_tokens": 695088976.0, + "step": 18215 + }, + { + "epoch": 2.317262434804732, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8887676000595093, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8676843643188477, + "num_tokens": 695128482.0, + "step": 18216 + }, + { + "epoch": 2.3173896450833227, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0032455921173096, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8606250286102295, + "num_tokens": 695164963.0, + "step": 18217 + }, + { + "epoch": 2.317516855361913, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.838434100151062, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8856908082962036, + "num_tokens": 695201515.0, + "step": 18218 + }, + { + "epoch": 2.3176440656405037, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.808199405670166, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8697787523269653, + "num_tokens": 695242057.0, + "step": 18219 + }, + { + "epoch": 2.3177712759190943, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.905417799949646, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.864870548248291, + "num_tokens": 695277128.0, + "step": 18220 + }, + { + "epoch": 2.317898486197685, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.136615753173828, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.878128707408905, + "num_tokens": 695311984.0, + "step": 18221 + }, + { + "epoch": 2.3180256964762753, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0001285076141357, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8522580862045288, + "num_tokens": 695348457.0, + "step": 18222 + }, + { + "epoch": 2.318152906754866, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9412235021591187, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8664716482162476, + "num_tokens": 695386518.0, + "step": 18223 + }, + { + "epoch": 2.3182801170334564, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0207602977752686, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8599510788917542, + "num_tokens": 695425365.0, + "step": 18224 + }, + { + "epoch": 2.318407327312047, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.184642791748047, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8652093410491943, + "num_tokens": 695459559.0, + "step": 18225 + }, + { + "epoch": 2.3185345375906374, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.4290387630462646, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8600999116897583, + "num_tokens": 695491386.0, + "step": 18226 + }, + { + "epoch": 2.318661747869228, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9383246898651123, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8471271991729736, + "num_tokens": 695534660.0, + "step": 18227 + }, + { + "epoch": 2.3187889581478185, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9003033638000488, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8561029434204102, + "num_tokens": 695578127.0, + "step": 18228 + }, + { + "epoch": 2.318916168426409, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9815119504928589, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.85207599401474, + "num_tokens": 695614181.0, + "step": 18229 + }, + { + "epoch": 2.3190433787049995, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8809747695922852, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8633725643157959, + "num_tokens": 695652753.0, + "step": 18230 + }, + { + "epoch": 2.3191705889835896, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8890752792358398, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8713867664337158, + "num_tokens": 695692449.0, + "step": 18231 + }, + { + "epoch": 2.3192977992621806, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.035980224609375, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8656784296035767, + "num_tokens": 695730812.0, + "step": 18232 + }, + { + "epoch": 2.3194250095407707, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.909529209136963, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.879131019115448, + "num_tokens": 695767577.0, + "step": 18233 + }, + { + "epoch": 2.319552219819361, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8975590467453003, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.864014208316803, + "num_tokens": 695812372.0, + "step": 18234 + }, + { + "epoch": 2.3196794300979517, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8313210010528564, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8814545273780823, + "num_tokens": 695847470.0, + "step": 18235 + }, + { + "epoch": 2.3198066403765423, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7420092821121216, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8808480501174927, + "num_tokens": 695888082.0, + "step": 18236 + }, + { + "epoch": 2.319933850655133, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8060654401779175, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8723901510238647, + "num_tokens": 695928904.0, + "step": 18237 + }, + { + "epoch": 2.3200610609337233, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9489843845367432, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8527926206588745, + "num_tokens": 695967549.0, + "step": 18238 + }, + { + "epoch": 2.320188271212314, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9636296033859253, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8587692379951477, + "num_tokens": 696005611.0, + "step": 18239 + }, + { + "epoch": 2.3203154814909044, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.14097261428833, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8678233027458191, + "num_tokens": 696034864.0, + "step": 18240 + }, + { + "epoch": 2.320442691769495, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9470672607421875, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.861301839351654, + "num_tokens": 696072137.0, + "step": 18241 + }, + { + "epoch": 2.3205699020480854, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.837839961051941, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8668259382247925, + "num_tokens": 696115744.0, + "step": 18242 + }, + { + "epoch": 2.320697112326676, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.004192352294922, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8596053123474121, + "num_tokens": 696151293.0, + "step": 18243 + }, + { + "epoch": 2.3208243226052665, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8601123094558716, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8821576833724976, + "num_tokens": 696190742.0, + "step": 18244 + }, + { + "epoch": 2.320951532883857, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9072315692901611, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8523435592651367, + "num_tokens": 696232807.0, + "step": 18245 + }, + { + "epoch": 2.3210787431624476, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8396480083465576, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.862870991230011, + "num_tokens": 696269416.0, + "step": 18246 + }, + { + "epoch": 2.321205953441038, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9506655931472778, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8670344352722168, + "num_tokens": 696305144.0, + "step": 18247 + }, + { + "epoch": 2.3213331637196286, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.019069194793701, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8556960821151733, + "num_tokens": 696344854.0, + "step": 18248 + }, + { + "epoch": 2.321460373998219, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9064841270446777, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8838157653808594, + "num_tokens": 696380154.0, + "step": 18249 + }, + { + "epoch": 2.3215875842768097, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.04095721244812, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8503984808921814, + "num_tokens": 696416458.0, + "step": 18250 + }, + { + "epoch": 2.3217147945554, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9342150688171387, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8656261563301086, + "num_tokens": 696453232.0, + "step": 18251 + }, + { + "epoch": 2.3218420048339907, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9446704387664795, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8443868160247803, + "num_tokens": 696487930.0, + "step": 18252 + }, + { + "epoch": 2.3219692151125813, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.070600986480713, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8548229932785034, + "num_tokens": 696523824.0, + "step": 18253 + }, + { + "epoch": 2.322096425391172, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8785244226455688, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8734948635101318, + "num_tokens": 696567007.0, + "step": 18254 + }, + { + "epoch": 2.3222236356697623, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.769644021987915, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8812063932418823, + "num_tokens": 696610055.0, + "step": 18255 + }, + { + "epoch": 2.3223508459483524, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9760937690734863, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8508099317550659, + "num_tokens": 696644058.0, + "step": 18256 + }, + { + "epoch": 2.3224780562269434, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.781393051147461, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8722966909408569, + "num_tokens": 696686381.0, + "step": 18257 + }, + { + "epoch": 2.3226052665055335, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8355590105056763, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8586785793304443, + "num_tokens": 696727567.0, + "step": 18258 + }, + { + "epoch": 2.322732476784124, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8788697719573975, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8556584119796753, + "num_tokens": 696767266.0, + "step": 18259 + }, + { + "epoch": 2.3228596870627145, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8047151565551758, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8778845071792603, + "num_tokens": 696806059.0, + "step": 18260 + }, + { + "epoch": 2.322986897341305, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9865601062774658, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8624852299690247, + "num_tokens": 696844232.0, + "step": 18261 + }, + { + "epoch": 2.3231141076198956, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8804346323013306, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8666773438453674, + "num_tokens": 696886223.0, + "step": 18262 + }, + { + "epoch": 2.323241317898486, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8895498514175415, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.850981593132019, + "num_tokens": 696922625.0, + "step": 18263 + }, + { + "epoch": 2.3233685281770766, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8621807098388672, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8522449731826782, + "num_tokens": 696965160.0, + "step": 18264 + }, + { + "epoch": 2.323495738455667, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9783552885055542, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8703311681747437, + "num_tokens": 697000854.0, + "step": 18265 + }, + { + "epoch": 2.3236229487342577, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.850314736366272, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8792220950126648, + "num_tokens": 697041264.0, + "step": 18266 + }, + { + "epoch": 2.323750159012848, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0026051998138428, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8614679574966431, + "num_tokens": 697080079.0, + "step": 18267 + }, + { + "epoch": 2.3238773692914387, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 5.095688343048096, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8605787754058838, + "num_tokens": 697123755.0, + "step": 18268 + }, + { + "epoch": 2.3240045795700293, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.067716598510742, + "learning_rate": 1e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.835463285446167, + "num_tokens": 697162012.0, + "step": 18269 + }, + { + "epoch": 2.32413178984862, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9359130859375, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8734787106513977, + "num_tokens": 697200188.0, + "step": 18270 + }, + { + "epoch": 2.3242590001272103, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0395896434783936, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8667776584625244, + "num_tokens": 697240536.0, + "step": 18271 + }, + { + "epoch": 2.324386210405801, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.90427827835083, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8764230012893677, + "num_tokens": 697279459.0, + "step": 18272 + }, + { + "epoch": 2.3245134206843914, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9178966283798218, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.882502555847168, + "num_tokens": 697312594.0, + "step": 18273 + }, + { + "epoch": 2.324640630962982, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8794379234313965, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8503443598747253, + "num_tokens": 697357834.0, + "step": 18274 + }, + { + "epoch": 2.3247678412415724, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.090482234954834, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8633029460906982, + "num_tokens": 697401522.0, + "step": 18275 + }, + { + "epoch": 2.324895051520163, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.328798532485962, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8613759279251099, + "num_tokens": 697440415.0, + "step": 18276 + }, + { + "epoch": 2.3250222617987535, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3460171222686768, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8481813669204712, + "num_tokens": 697477609.0, + "step": 18277 + }, + { + "epoch": 2.325149472077344, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0190958976745605, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8598767518997192, + "num_tokens": 697515566.0, + "step": 18278 + }, + { + "epoch": 2.3252766823559345, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9752072095870972, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8681377172470093, + "num_tokens": 697549537.0, + "step": 18279 + }, + { + "epoch": 2.325403892634525, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.977515459060669, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8684336543083191, + "num_tokens": 697582618.0, + "step": 18280 + }, + { + "epoch": 2.325531102913115, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8192085027694702, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8780691027641296, + "num_tokens": 697622234.0, + "step": 18281 + }, + { + "epoch": 2.325658313191706, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9329379796981812, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8550518155097961, + "num_tokens": 697657504.0, + "step": 18282 + }, + { + "epoch": 2.325785523470296, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9784079790115356, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8510406017303467, + "num_tokens": 697699496.0, + "step": 18283 + }, + { + "epoch": 2.3259127337488867, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8610968589782715, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8780577182769775, + "num_tokens": 697739969.0, + "step": 18284 + }, + { + "epoch": 2.3260399440274773, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9386345148086548, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8537009358406067, + "num_tokens": 697774346.0, + "step": 18285 + }, + { + "epoch": 2.326167154306068, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9417319297790527, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8499107360839844, + "num_tokens": 697813013.0, + "step": 18286 + }, + { + "epoch": 2.3262943645846583, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8158807754516602, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8574172258377075, + "num_tokens": 697854163.0, + "step": 18287 + }, + { + "epoch": 2.326421574863249, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7930023670196533, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8720394372940063, + "num_tokens": 697898049.0, + "step": 18288 + }, + { + "epoch": 2.3265487851418394, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8087081909179688, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8675408363342285, + "num_tokens": 697936086.0, + "step": 18289 + }, + { + "epoch": 2.32667599542043, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1797587871551514, + "learning_rate": 1e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.8419497013092041, + "num_tokens": 697981529.0, + "step": 18290 + }, + { + "epoch": 2.3268032056990204, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9252235889434814, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8808231353759766, + "num_tokens": 698016047.0, + "step": 18291 + }, + { + "epoch": 2.326930415977611, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.967867374420166, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8763245940208435, + "num_tokens": 698052408.0, + "step": 18292 + }, + { + "epoch": 2.3270576262562015, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9230974912643433, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8570623397827148, + "num_tokens": 698089751.0, + "step": 18293 + }, + { + "epoch": 2.327184836534792, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9878439903259277, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8616294264793396, + "num_tokens": 698121274.0, + "step": 18294 + }, + { + "epoch": 2.3273120468133826, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9766407012939453, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8735830783843994, + "num_tokens": 698155880.0, + "step": 18295 + }, + { + "epoch": 2.327439257091973, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.067211866378784, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8672285079956055, + "num_tokens": 698191448.0, + "step": 18296 + }, + { + "epoch": 2.3275664673705636, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.939079999923706, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8606438636779785, + "num_tokens": 698228437.0, + "step": 18297 + }, + { + "epoch": 2.327693677649154, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9217169284820557, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8562349081039429, + "num_tokens": 698268714.0, + "step": 18298 + }, + { + "epoch": 2.3278208879277447, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.888789176940918, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8645283579826355, + "num_tokens": 698305575.0, + "step": 18299 + }, + { + "epoch": 2.327948098206335, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.119208335876465, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8608590364456177, + "num_tokens": 698347701.0, + "step": 18300 + }, + { + "epoch": 2.3280753084849257, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7993316650390625, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8809150457382202, + "num_tokens": 698385906.0, + "step": 18301 + }, + { + "epoch": 2.3282025187635162, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.096095085144043, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8639425039291382, + "num_tokens": 698415511.0, + "step": 18302 + }, + { + "epoch": 2.3283297290421068, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8486652374267578, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8596485257148743, + "num_tokens": 698460528.0, + "step": 18303 + }, + { + "epoch": 2.3284569393206973, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9305789470672607, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8486238121986389, + "num_tokens": 698500800.0, + "step": 18304 + }, + { + "epoch": 2.328584149599288, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.974006175994873, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.863591194152832, + "num_tokens": 698539373.0, + "step": 18305 + }, + { + "epoch": 2.328711359877878, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7928318977355957, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8853837251663208, + "num_tokens": 698577067.0, + "step": 18306 + }, + { + "epoch": 2.328838570156469, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9376943111419678, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.863930881023407, + "num_tokens": 698613515.0, + "step": 18307 + }, + { + "epoch": 2.328965780435059, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0044591426849365, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8651989102363586, + "num_tokens": 698647278.0, + "step": 18308 + }, + { + "epoch": 2.3290929907136495, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8838768005371094, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8732354044914246, + "num_tokens": 698688738.0, + "step": 18309 + }, + { + "epoch": 2.32922020099224, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.87627375125885, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8665996193885803, + "num_tokens": 698725003.0, + "step": 18310 + }, + { + "epoch": 2.3293474112708306, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7455652952194214, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8475159406661987, + "num_tokens": 698768205.0, + "step": 18311 + }, + { + "epoch": 2.329474621549421, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9862314462661743, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8452143669128418, + "num_tokens": 698805352.0, + "step": 18312 + }, + { + "epoch": 2.3296018318280116, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0262091159820557, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8748743534088135, + "num_tokens": 698836610.0, + "step": 18313 + }, + { + "epoch": 2.329729042106602, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0918097496032715, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8733712434768677, + "num_tokens": 698877514.0, + "step": 18314 + }, + { + "epoch": 2.3298562523851927, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.894113302230835, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8673515319824219, + "num_tokens": 698916699.0, + "step": 18315 + }, + { + "epoch": 2.329983462663783, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8973716497421265, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8664381504058838, + "num_tokens": 698953568.0, + "step": 18316 + }, + { + "epoch": 2.3301106729423737, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8462830781936646, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8596659898757935, + "num_tokens": 698990562.0, + "step": 18317 + }, + { + "epoch": 2.3302378832209643, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9925963878631592, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8675193786621094, + "num_tokens": 699024095.0, + "step": 18318 + }, + { + "epoch": 2.330365093499555, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0013325214385986, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8438715934753418, + "num_tokens": 699062327.0, + "step": 18319 + }, + { + "epoch": 2.3304923037781453, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8357905149459839, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8676834106445312, + "num_tokens": 699101591.0, + "step": 18320 + }, + { + "epoch": 2.330619514056736, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.101623296737671, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8557161092758179, + "num_tokens": 699134836.0, + "step": 18321 + }, + { + "epoch": 2.3307467243353264, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9354218244552612, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8547931909561157, + "num_tokens": 699171218.0, + "step": 18322 + }, + { + "epoch": 2.330873934613917, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.094391107559204, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8824491500854492, + "num_tokens": 699200891.0, + "step": 18323 + }, + { + "epoch": 2.3310011448925074, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.967430591583252, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8803433179855347, + "num_tokens": 699238080.0, + "step": 18324 + }, + { + "epoch": 2.331128355171098, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0103566646575928, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8532534837722778, + "num_tokens": 699276051.0, + "step": 18325 + }, + { + "epoch": 2.3312555654496885, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9243769645690918, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8719364404678345, + "num_tokens": 699315721.0, + "step": 18326 + }, + { + "epoch": 2.331382775728279, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9338042736053467, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8745240569114685, + "num_tokens": 699350342.0, + "step": 18327 + }, + { + "epoch": 2.3315099860068695, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9761940240859985, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8701043128967285, + "num_tokens": 699390301.0, + "step": 18328 + }, + { + "epoch": 2.3316371962854596, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 16.624340057373047, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8489634394645691, + "num_tokens": 699427135.0, + "step": 18329 + }, + { + "epoch": 2.3317644065640506, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0001156330108643, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.870120108127594, + "num_tokens": 699466461.0, + "step": 18330 + }, + { + "epoch": 2.3318916168426407, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9768754243850708, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8577662110328674, + "num_tokens": 699502446.0, + "step": 18331 + }, + { + "epoch": 2.332018827121231, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7455869913101196, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8637475371360779, + "num_tokens": 699546357.0, + "step": 18332 + }, + { + "epoch": 2.3321460373998217, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.5470073223114014, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8644390106201172, + "num_tokens": 699581879.0, + "step": 18333 + }, + { + "epoch": 2.3322732476784123, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.990987777709961, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.878132700920105, + "num_tokens": 699616287.0, + "step": 18334 + }, + { + "epoch": 2.332400457957003, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.833205223083496, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8611837029457092, + "num_tokens": 699655409.0, + "step": 18335 + }, + { + "epoch": 2.3325276682355933, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8685083389282227, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.838765025138855, + "num_tokens": 699701420.0, + "step": 18336 + }, + { + "epoch": 2.332654878514184, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9565200805664062, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.867900013923645, + "num_tokens": 699736151.0, + "step": 18337 + }, + { + "epoch": 2.3327820887927744, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.127246141433716, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8567762970924377, + "num_tokens": 699770111.0, + "step": 18338 + }, + { + "epoch": 2.332909299071365, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7859350442886353, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.861628532409668, + "num_tokens": 699807442.0, + "step": 18339 + }, + { + "epoch": 2.3330365093499554, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7778229713439941, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8628028631210327, + "num_tokens": 699849364.0, + "step": 18340 + }, + { + "epoch": 2.333163719628546, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8133223056793213, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8776143193244934, + "num_tokens": 699886234.0, + "step": 18341 + }, + { + "epoch": 2.3332909299071365, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9548348188400269, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8458968997001648, + "num_tokens": 699925660.0, + "step": 18342 + }, + { + "epoch": 2.333418140185727, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.538966417312622, + "learning_rate": 1e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8385592699050903, + "num_tokens": 699967894.0, + "step": 18343 + }, + { + "epoch": 2.3335453504643175, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0312275886535645, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8585366010665894, + "num_tokens": 699999211.0, + "step": 18344 + }, + { + "epoch": 2.333672560742908, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.863108515739441, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.871698260307312, + "num_tokens": 700037639.0, + "step": 18345 + }, + { + "epoch": 2.3337997710214986, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8907485008239746, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8570952415466309, + "num_tokens": 700075677.0, + "step": 18346 + }, + { + "epoch": 2.333926981300089, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8215819597244263, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8704394102096558, + "num_tokens": 700115697.0, + "step": 18347 + }, + { + "epoch": 2.3340541915786797, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.937333583831787, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8464462757110596, + "num_tokens": 700152553.0, + "step": 18348 + }, + { + "epoch": 2.33418140185727, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7701138257980347, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8716887831687927, + "num_tokens": 700190958.0, + "step": 18349 + }, + { + "epoch": 2.3343086121358607, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0403103828430176, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.860398530960083, + "num_tokens": 700224244.0, + "step": 18350 + }, + { + "epoch": 2.3344358224144512, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.956543207168579, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8902081251144409, + "num_tokens": 700259736.0, + "step": 18351 + }, + { + "epoch": 2.3345630326930418, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.046616554260254, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8645088076591492, + "num_tokens": 700293624.0, + "step": 18352 + }, + { + "epoch": 2.3346902429716323, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9129220247268677, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8670197129249573, + "num_tokens": 700328052.0, + "step": 18353 + }, + { + "epoch": 2.3348174532502224, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9446473121643066, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8575026988983154, + "num_tokens": 700364652.0, + "step": 18354 + }, + { + "epoch": 2.3349446635288134, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8882120847702026, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8774320483207703, + "num_tokens": 700403698.0, + "step": 18355 + }, + { + "epoch": 2.3350718738074034, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.266838312149048, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8777748346328735, + "num_tokens": 700439391.0, + "step": 18356 + }, + { + "epoch": 2.335199084085994, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.072169542312622, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.86542809009552, + "num_tokens": 700475095.0, + "step": 18357 + }, + { + "epoch": 2.3353262943645845, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8607081174850464, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.869206964969635, + "num_tokens": 700514167.0, + "step": 18358 + }, + { + "epoch": 2.335453504643175, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0600745677948, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8732169270515442, + "num_tokens": 700548333.0, + "step": 18359 + }, + { + "epoch": 2.3355807149217656, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7850600481033325, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8685102462768555, + "num_tokens": 700594261.0, + "step": 18360 + }, + { + "epoch": 2.335707925200356, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.027585983276367, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8708978891372681, + "num_tokens": 700632513.0, + "step": 18361 + }, + { + "epoch": 2.3358351354789466, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8554309606552124, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8709062337875366, + "num_tokens": 700672430.0, + "step": 18362 + }, + { + "epoch": 2.335962345757537, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9666486978530884, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.870136559009552, + "num_tokens": 700714321.0, + "step": 18363 + }, + { + "epoch": 2.3360895560361277, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.940993070602417, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8642992377281189, + "num_tokens": 700751714.0, + "step": 18364 + }, + { + "epoch": 2.336216766314718, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9321811199188232, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.862430214881897, + "num_tokens": 700794087.0, + "step": 18365 + }, + { + "epoch": 2.3363439765933087, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.023618459701538, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8583331108093262, + "num_tokens": 700832031.0, + "step": 18366 + }, + { + "epoch": 2.3364711868718993, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9165674448013306, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8676287531852722, + "num_tokens": 700867175.0, + "step": 18367 + }, + { + "epoch": 2.33659839715049, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8440158367156982, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8702303767204285, + "num_tokens": 700906762.0, + "step": 18368 + }, + { + "epoch": 2.3367256074290803, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8198492527008057, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8673962354660034, + "num_tokens": 700944316.0, + "step": 18369 + }, + { + "epoch": 2.336852817707671, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9938218593597412, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8614116907119751, + "num_tokens": 700979369.0, + "step": 18370 + }, + { + "epoch": 2.3369800279862614, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 16.623626708984375, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8618646860122681, + "num_tokens": 701016213.0, + "step": 18371 + }, + { + "epoch": 2.337107238264852, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.258528709411621, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8615992069244385, + "num_tokens": 701055270.0, + "step": 18372 + }, + { + "epoch": 2.3372344485434424, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9919934272766113, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.861896276473999, + "num_tokens": 701091058.0, + "step": 18373 + }, + { + "epoch": 2.337361658822033, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9028373956680298, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8726263046264648, + "num_tokens": 701129446.0, + "step": 18374 + }, + { + "epoch": 2.3374888691006235, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.984803318977356, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8696187734603882, + "num_tokens": 701168726.0, + "step": 18375 + }, + { + "epoch": 2.337616079379214, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1735000610351562, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8592950701713562, + "num_tokens": 701204041.0, + "step": 18376 + }, + { + "epoch": 2.3377432896578045, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9545323848724365, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8416435122489929, + "num_tokens": 701244660.0, + "step": 18377 + }, + { + "epoch": 2.337870499936395, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9177802801132202, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8551772236824036, + "num_tokens": 701284830.0, + "step": 18378 + }, + { + "epoch": 2.337997710214985, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9229521751403809, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8711000680923462, + "num_tokens": 701319720.0, + "step": 18379 + }, + { + "epoch": 2.338124920493576, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8208904266357422, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8445234894752502, + "num_tokens": 701361770.0, + "step": 18380 + }, + { + "epoch": 2.338252130772166, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9047539234161377, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.862686276435852, + "num_tokens": 701399777.0, + "step": 18381 + }, + { + "epoch": 2.3383793410507567, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8823034763336182, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8638032674789429, + "num_tokens": 701432680.0, + "step": 18382 + }, + { + "epoch": 2.3385065513293473, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7865463495254517, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.875754714012146, + "num_tokens": 701474101.0, + "step": 18383 + }, + { + "epoch": 2.338633761607938, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8899810314178467, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8763397932052612, + "num_tokens": 701517568.0, + "step": 18384 + }, + { + "epoch": 2.3387609718865283, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8916932344436646, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8592642545700073, + "num_tokens": 701558466.0, + "step": 18385 + }, + { + "epoch": 2.338888182165119, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0748069286346436, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.858842134475708, + "num_tokens": 701591104.0, + "step": 18386 + }, + { + "epoch": 2.3390153924437094, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9834048748016357, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8574613928794861, + "num_tokens": 701633242.0, + "step": 18387 + }, + { + "epoch": 2.3391426027223, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9297012090682983, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8634254336357117, + "num_tokens": 701668864.0, + "step": 18388 + }, + { + "epoch": 2.3392698130008904, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2846152782440186, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8670884370803833, + "num_tokens": 701703648.0, + "step": 18389 + }, + { + "epoch": 2.339397023279481, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.003526449203491, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8618322610855103, + "num_tokens": 701744417.0, + "step": 18390 + }, + { + "epoch": 2.3395242335580715, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.009657382965088, + "learning_rate": 1e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.8318300247192383, + "num_tokens": 701783541.0, + "step": 18391 + }, + { + "epoch": 2.339651443836662, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.922004222869873, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8606104254722595, + "num_tokens": 701825054.0, + "step": 18392 + }, + { + "epoch": 2.3397786541152525, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.773590087890625, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8727121949195862, + "num_tokens": 701866173.0, + "step": 18393 + }, + { + "epoch": 2.339905864393843, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9677833318710327, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.860136866569519, + "num_tokens": 701904812.0, + "step": 18394 + }, + { + "epoch": 2.3400330746724336, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9202601909637451, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8538239002227783, + "num_tokens": 701942590.0, + "step": 18395 + }, + { + "epoch": 2.340160284951024, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0115463733673096, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8681316375732422, + "num_tokens": 701980773.0, + "step": 18396 + }, + { + "epoch": 2.3402874952296147, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8763542175292969, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8756079077720642, + "num_tokens": 702014508.0, + "step": 18397 + }, + { + "epoch": 2.340414705508205, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8197675943374634, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8766425848007202, + "num_tokens": 702055754.0, + "step": 18398 + }, + { + "epoch": 2.3405419157867957, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8113796710968018, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8645479679107666, + "num_tokens": 702100022.0, + "step": 18399 + }, + { + "epoch": 2.3406691260653862, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1495912075042725, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8519625663757324, + "num_tokens": 702133217.0, + "step": 18400 + }, + { + "epoch": 2.3407963363439768, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7492644786834717, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8773821592330933, + "num_tokens": 702176998.0, + "step": 18401 + }, + { + "epoch": 2.3409235466225673, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.953223705291748, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8705412149429321, + "num_tokens": 702216219.0, + "step": 18402 + }, + { + "epoch": 2.341050756901158, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1467998027801514, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8595610857009888, + "num_tokens": 702246502.0, + "step": 18403 + }, + { + "epoch": 2.341177967179748, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1337716579437256, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8716806769371033, + "num_tokens": 702277990.0, + "step": 18404 + }, + { + "epoch": 2.341305177458339, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.095518112182617, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8659769296646118, + "num_tokens": 702312742.0, + "step": 18405 + }, + { + "epoch": 2.341432387736929, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.959460735321045, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8802459239959717, + "num_tokens": 702348575.0, + "step": 18406 + }, + { + "epoch": 2.3415595980155195, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9568419456481934, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8522924780845642, + "num_tokens": 702385956.0, + "step": 18407 + }, + { + "epoch": 2.34168680829411, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.048736572265625, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8705191016197205, + "num_tokens": 702423973.0, + "step": 18408 + }, + { + "epoch": 2.3418140185727006, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1348040103912354, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8597369194030762, + "num_tokens": 702455508.0, + "step": 18409 + }, + { + "epoch": 2.341941228851291, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8994837999343872, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8769333362579346, + "num_tokens": 702493365.0, + "step": 18410 + }, + { + "epoch": 2.3420684391298816, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9413903951644897, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8670200109481812, + "num_tokens": 702528036.0, + "step": 18411 + }, + { + "epoch": 2.342195649408472, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9452767372131348, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8592361211776733, + "num_tokens": 702571238.0, + "step": 18412 + }, + { + "epoch": 2.3423228596870627, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.884702205657959, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8602128624916077, + "num_tokens": 702611247.0, + "step": 18413 + }, + { + "epoch": 2.342450069965653, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9558855295181274, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8658794164657593, + "num_tokens": 702649661.0, + "step": 18414 + }, + { + "epoch": 2.3425772802442437, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0344228744506836, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8566291332244873, + "num_tokens": 702693573.0, + "step": 18415 + }, + { + "epoch": 2.3427044905228342, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6688812971115112, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8743234276771545, + "num_tokens": 702735351.0, + "step": 18416 + }, + { + "epoch": 2.3428317008014248, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8799477815628052, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.870698094367981, + "num_tokens": 702773406.0, + "step": 18417 + }, + { + "epoch": 2.3429589110800153, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9182798862457275, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8716952800750732, + "num_tokens": 702809939.0, + "step": 18418 + }, + { + "epoch": 2.343086121358606, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8223568201065063, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.861945390701294, + "num_tokens": 702851120.0, + "step": 18419 + }, + { + "epoch": 2.3432133316371964, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7536894083023071, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8721625804901123, + "num_tokens": 702889492.0, + "step": 18420 + }, + { + "epoch": 2.343340541915787, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8249948024749756, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.858431339263916, + "num_tokens": 702930486.0, + "step": 18421 + }, + { + "epoch": 2.3434677521943774, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8224397897720337, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8560972809791565, + "num_tokens": 702969866.0, + "step": 18422 + }, + { + "epoch": 2.343594962472968, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.922639012336731, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8555586934089661, + "num_tokens": 703013703.0, + "step": 18423 + }, + { + "epoch": 2.3437221727515585, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8287473917007446, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8556773662567139, + "num_tokens": 703054849.0, + "step": 18424 + }, + { + "epoch": 2.343849383030149, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.048877000808716, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8590713739395142, + "num_tokens": 703088250.0, + "step": 18425 + }, + { + "epoch": 2.3439765933087395, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9902522563934326, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8594793677330017, + "num_tokens": 703124072.0, + "step": 18426 + }, + { + "epoch": 2.3441038035873296, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.078472852706909, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8458616733551025, + "num_tokens": 703158869.0, + "step": 18427 + }, + { + "epoch": 2.3442310138659206, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9064278602600098, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.867479681968689, + "num_tokens": 703193342.0, + "step": 18428 + }, + { + "epoch": 2.3443582241445107, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9402461051940918, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8771644830703735, + "num_tokens": 703229899.0, + "step": 18429 + }, + { + "epoch": 2.344485434423101, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8499741554260254, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8628958463668823, + "num_tokens": 703266735.0, + "step": 18430 + }, + { + "epoch": 2.3446126447016917, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9101834297180176, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8606570959091187, + "num_tokens": 703307515.0, + "step": 18431 + }, + { + "epoch": 2.3447398549802823, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0253677368164062, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8741929531097412, + "num_tokens": 703341237.0, + "step": 18432 + }, + { + "epoch": 2.344867065258873, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8249804973602295, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.874122142791748, + "num_tokens": 703378900.0, + "step": 18433 + }, + { + "epoch": 2.3449942755374633, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8356057405471802, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8732560873031616, + "num_tokens": 703415260.0, + "step": 18434 + }, + { + "epoch": 2.345121485816054, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8489853143692017, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8709833025932312, + "num_tokens": 703452399.0, + "step": 18435 + }, + { + "epoch": 2.3452486960946444, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.79518461227417, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8661070466041565, + "num_tokens": 703492700.0, + "step": 18436 + }, + { + "epoch": 2.345375906373235, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7931110858917236, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8755925893783569, + "num_tokens": 703530805.0, + "step": 18437 + }, + { + "epoch": 2.3455031166518254, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9484434127807617, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8732938766479492, + "num_tokens": 703571894.0, + "step": 18438 + }, + { + "epoch": 2.345630326930416, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9200119972229004, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8704877495765686, + "num_tokens": 703612583.0, + "step": 18439 + }, + { + "epoch": 2.3457575372090065, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0350430011749268, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8578585386276245, + "num_tokens": 703649100.0, + "step": 18440 + }, + { + "epoch": 2.345884747487597, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1894783973693848, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8790209293365479, + "num_tokens": 703680425.0, + "step": 18441 + }, + { + "epoch": 2.3460119577661875, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8881454467773438, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8718938827514648, + "num_tokens": 703721819.0, + "step": 18442 + }, + { + "epoch": 2.346139168044778, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9368748664855957, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8620336055755615, + "num_tokens": 703755432.0, + "step": 18443 + }, + { + "epoch": 2.3462663783233686, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.791638731956482, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8639965057373047, + "num_tokens": 703795903.0, + "step": 18444 + }, + { + "epoch": 2.346393588601959, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.759596824645996, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8766165971755981, + "num_tokens": 703836074.0, + "step": 18445 + }, + { + "epoch": 2.3465207988805497, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8827768564224243, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8708202242851257, + "num_tokens": 703870609.0, + "step": 18446 + }, + { + "epoch": 2.34664800915914, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8880016803741455, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8694791197776794, + "num_tokens": 703909034.0, + "step": 18447 + }, + { + "epoch": 2.3467752194377307, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9801121950149536, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.858013391494751, + "num_tokens": 703946976.0, + "step": 18448 + }, + { + "epoch": 2.3469024297163212, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9075500965118408, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8674757480621338, + "num_tokens": 703986379.0, + "step": 18449 + }, + { + "epoch": 2.3470296399949118, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8480664491653442, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.883375883102417, + "num_tokens": 704019461.0, + "step": 18450 + }, + { + "epoch": 2.3471568502735023, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.27480411529541, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8537390828132629, + "num_tokens": 704053411.0, + "step": 18451 + }, + { + "epoch": 2.3472840605520924, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9108002185821533, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8691287040710449, + "num_tokens": 704092633.0, + "step": 18452 + }, + { + "epoch": 2.3474112708306833, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.012101173400879, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8678593635559082, + "num_tokens": 704126955.0, + "step": 18453 + }, + { + "epoch": 2.3475384811092734, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.908668875694275, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8808457851409912, + "num_tokens": 704162222.0, + "step": 18454 + }, + { + "epoch": 2.347665691387864, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9349652528762817, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8651366233825684, + "num_tokens": 704198949.0, + "step": 18455 + }, + { + "epoch": 2.3477929016664545, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9710657596588135, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8599307537078857, + "num_tokens": 704235327.0, + "step": 18456 + }, + { + "epoch": 2.347920111945045, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8907028436660767, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8718662261962891, + "num_tokens": 704272783.0, + "step": 18457 + }, + { + "epoch": 2.3480473222236355, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9177474975585938, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8468818664550781, + "num_tokens": 704314396.0, + "step": 18458 + }, + { + "epoch": 2.348174532502226, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8327593803405762, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8812329769134521, + "num_tokens": 704346903.0, + "step": 18459 + }, + { + "epoch": 2.3483017427808166, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8532021045684814, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8735235333442688, + "num_tokens": 704380456.0, + "step": 18460 + }, + { + "epoch": 2.348428953059407, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8544142246246338, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8778433799743652, + "num_tokens": 704416403.0, + "step": 18461 + }, + { + "epoch": 2.3485561633379977, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.026549816131592, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8666536808013916, + "num_tokens": 704460431.0, + "step": 18462 + }, + { + "epoch": 2.348683373616588, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.946940541267395, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8730593919754028, + "num_tokens": 704497872.0, + "step": 18463 + }, + { + "epoch": 2.3488105838951787, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8521366119384766, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8672903776168823, + "num_tokens": 704538180.0, + "step": 18464 + }, + { + "epoch": 2.3489377941737692, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.036766529083252, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.878049910068512, + "num_tokens": 704578342.0, + "step": 18465 + }, + { + "epoch": 2.3490650044523598, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9290571212768555, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8756885528564453, + "num_tokens": 704611792.0, + "step": 18466 + }, + { + "epoch": 2.3491922147309503, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9241029024124146, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8690274953842163, + "num_tokens": 704649898.0, + "step": 18467 + }, + { + "epoch": 2.349319425009541, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2528693675994873, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8736182451248169, + "num_tokens": 704690632.0, + "step": 18468 + }, + { + "epoch": 2.3494466352881314, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2125470638275146, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8546603918075562, + "num_tokens": 704722955.0, + "step": 18469 + }, + { + "epoch": 2.349573845566722, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.848926067352295, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8653967380523682, + "num_tokens": 704762678.0, + "step": 18470 + }, + { + "epoch": 2.3497010558453124, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7123465538024902, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8915918469429016, + "num_tokens": 704805440.0, + "step": 18471 + }, + { + "epoch": 2.349828266123903, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.812632441520691, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8474533557891846, + "num_tokens": 704848153.0, + "step": 18472 + }, + { + "epoch": 2.3499554764024935, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8163095712661743, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8636612892150879, + "num_tokens": 704886739.0, + "step": 18473 + }, + { + "epoch": 2.350082686681084, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9353584051132202, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8656794428825378, + "num_tokens": 704926145.0, + "step": 18474 + }, + { + "epoch": 2.3502098969596745, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8898521661758423, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.848530650138855, + "num_tokens": 704964135.0, + "step": 18475 + }, + { + "epoch": 2.350337107238265, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9758812189102173, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8523702621459961, + "num_tokens": 705002171.0, + "step": 18476 + }, + { + "epoch": 2.350464317516855, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6974149942398071, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8841375112533569, + "num_tokens": 705040660.0, + "step": 18477 + }, + { + "epoch": 2.350591527795446, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9015605449676514, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8717368245124817, + "num_tokens": 705073655.0, + "step": 18478 + }, + { + "epoch": 2.350718738074036, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0481581687927246, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8616036772727966, + "num_tokens": 705115497.0, + "step": 18479 + }, + { + "epoch": 2.3508459483526267, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7897292375564575, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8837558031082153, + "num_tokens": 705152642.0, + "step": 18480 + }, + { + "epoch": 2.3509731586312173, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0314364433288574, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8500849008560181, + "num_tokens": 705187282.0, + "step": 18481 + }, + { + "epoch": 2.351100368909808, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9180023670196533, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8695313930511475, + "num_tokens": 705225635.0, + "step": 18482 + }, + { + "epoch": 2.3512275791883983, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0182712078094482, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8746562600135803, + "num_tokens": 705260376.0, + "step": 18483 + }, + { + "epoch": 2.351354789466989, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0530290603637695, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8641124963760376, + "num_tokens": 705298799.0, + "step": 18484 + }, + { + "epoch": 2.3514819997455794, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9814590215682983, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8472989201545715, + "num_tokens": 705332570.0, + "step": 18485 + }, + { + "epoch": 2.35160921002417, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.950002670288086, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8675430417060852, + "num_tokens": 705371214.0, + "step": 18486 + }, + { + "epoch": 2.3517364203027604, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9595729112625122, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8627907633781433, + "num_tokens": 705405051.0, + "step": 18487 + }, + { + "epoch": 2.351863630581351, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9747322797775269, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8647223711013794, + "num_tokens": 705442460.0, + "step": 18488 + }, + { + "epoch": 2.3519908408599415, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7997108697891235, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.864860475063324, + "num_tokens": 705482929.0, + "step": 18489 + }, + { + "epoch": 2.352118051138532, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.011087417602539, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8596382141113281, + "num_tokens": 705525313.0, + "step": 18490 + }, + { + "epoch": 2.3522452614171225, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9128128290176392, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8617740273475647, + "num_tokens": 705563730.0, + "step": 18491 + }, + { + "epoch": 2.352372471695713, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0299298763275146, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8689785003662109, + "num_tokens": 705602459.0, + "step": 18492 + }, + { + "epoch": 2.3524996819743036, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8502908945083618, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8621962070465088, + "num_tokens": 705636223.0, + "step": 18493 + }, + { + "epoch": 2.352626892252894, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0707008838653564, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8636214733123779, + "num_tokens": 705676178.0, + "step": 18494 + }, + { + "epoch": 2.3527541025314846, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8612720966339111, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8719252347946167, + "num_tokens": 705716687.0, + "step": 18495 + }, + { + "epoch": 2.352881312810075, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 3.0261728763580322, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8733651638031006, + "num_tokens": 705759163.0, + "step": 18496 + }, + { + "epoch": 2.3530085230886657, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.941603422164917, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8677471876144409, + "num_tokens": 705795982.0, + "step": 18497 + }, + { + "epoch": 2.3531357333672562, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9673197269439697, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8692843914031982, + "num_tokens": 705835601.0, + "step": 18498 + }, + { + "epoch": 2.3532629436458468, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0614097118377686, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8633358478546143, + "num_tokens": 705876858.0, + "step": 18499 + }, + { + "epoch": 2.353390153924437, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9317210912704468, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8566705584526062, + "num_tokens": 705921944.0, + "step": 18500 + }, + { + "epoch": 2.353517364203028, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.80379319190979, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8668429851531982, + "num_tokens": 705967999.0, + "step": 18501 + }, + { + "epoch": 2.353644574481618, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9376227855682373, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8589104413986206, + "num_tokens": 706003370.0, + "step": 18502 + }, + { + "epoch": 2.353771784760209, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.008284568786621, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8530175089836121, + "num_tokens": 706036551.0, + "step": 18503 + }, + { + "epoch": 2.353898995038799, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8258498907089233, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8840019106864929, + "num_tokens": 706075965.0, + "step": 18504 + }, + { + "epoch": 2.3540262053173895, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0396082401275635, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8538597822189331, + "num_tokens": 706117943.0, + "step": 18505 + }, + { + "epoch": 2.35415341559598, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8068439960479736, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8696053624153137, + "num_tokens": 706157183.0, + "step": 18506 + }, + { + "epoch": 2.3542806258745705, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.800253987312317, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8731725811958313, + "num_tokens": 706196878.0, + "step": 18507 + }, + { + "epoch": 2.354407836153161, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.325115919113159, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8525179624557495, + "num_tokens": 706231102.0, + "step": 18508 + }, + { + "epoch": 2.3545350464317516, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7763110399246216, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8732509613037109, + "num_tokens": 706273372.0, + "step": 18509 + }, + { + "epoch": 2.354662256710342, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9485816955566406, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8606510758399963, + "num_tokens": 706308106.0, + "step": 18510 + }, + { + "epoch": 2.3547894669889327, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9433531761169434, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8730089068412781, + "num_tokens": 706344267.0, + "step": 18511 + }, + { + "epoch": 2.354916677267523, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7808717489242554, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8847293257713318, + "num_tokens": 706385019.0, + "step": 18512 + }, + { + "epoch": 2.3550438875461137, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.820981740951538, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8639155626296997, + "num_tokens": 706424913.0, + "step": 18513 + }, + { + "epoch": 2.3551710978247042, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.860645055770874, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8666142225265503, + "num_tokens": 706466064.0, + "step": 18514 + }, + { + "epoch": 2.3552983081032948, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.90409517288208, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.871097207069397, + "num_tokens": 706504572.0, + "step": 18515 + }, + { + "epoch": 2.3554255183818853, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0035898685455322, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8593416810035706, + "num_tokens": 706543901.0, + "step": 18516 + }, + { + "epoch": 2.355552728660476, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9761947393417358, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.859454870223999, + "num_tokens": 706579095.0, + "step": 18517 + }, + { + "epoch": 2.3556799389390664, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.019756555557251, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8574262857437134, + "num_tokens": 706614078.0, + "step": 18518 + }, + { + "epoch": 2.355807149217657, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1752798557281494, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8723587393760681, + "num_tokens": 706652596.0, + "step": 18519 + }, + { + "epoch": 2.3559343594962474, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9005608558654785, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8884677290916443, + "num_tokens": 706691297.0, + "step": 18520 + }, + { + "epoch": 2.356061569774838, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9729737043380737, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8544800877571106, + "num_tokens": 706727344.0, + "step": 18521 + }, + { + "epoch": 2.3561887800534285, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.724004864692688, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8694379329681396, + "num_tokens": 706770411.0, + "step": 18522 + }, + { + "epoch": 2.356315990332019, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.854628324508667, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8698201179504395, + "num_tokens": 706816215.0, + "step": 18523 + }, + { + "epoch": 2.3564432006106095, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8319214582443237, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8605095148086548, + "num_tokens": 706857400.0, + "step": 18524 + }, + { + "epoch": 2.3565704108891996, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8863838911056519, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8580318093299866, + "num_tokens": 706892715.0, + "step": 18525 + }, + { + "epoch": 2.3566976211677906, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8261404037475586, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.865577220916748, + "num_tokens": 706934373.0, + "step": 18526 + }, + { + "epoch": 2.3568248314463807, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8561069965362549, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8672689199447632, + "num_tokens": 706976885.0, + "step": 18527 + }, + { + "epoch": 2.356952041724971, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8200464248657227, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8724758625030518, + "num_tokens": 707016957.0, + "step": 18528 + }, + { + "epoch": 2.3570792520035617, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9032646417617798, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.87563157081604, + "num_tokens": 707053687.0, + "step": 18529 + }, + { + "epoch": 2.3572064622821522, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9550570249557495, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8594448566436768, + "num_tokens": 707087327.0, + "step": 18530 + }, + { + "epoch": 2.3573336725607428, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0840179920196533, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8780394196510315, + "num_tokens": 707124479.0, + "step": 18531 + }, + { + "epoch": 2.3574608828393333, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.4349772930145264, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8689863681793213, + "num_tokens": 707162452.0, + "step": 18532 + }, + { + "epoch": 2.357588093117924, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.12200927734375, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8744728565216064, + "num_tokens": 707196765.0, + "step": 18533 + }, + { + "epoch": 2.3577153033965144, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8983088731765747, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.872949481010437, + "num_tokens": 707230151.0, + "step": 18534 + }, + { + "epoch": 2.357842513675105, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9949162006378174, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8482078313827515, + "num_tokens": 707264373.0, + "step": 18535 + }, + { + "epoch": 2.3579697239536954, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.847211480140686, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8611350059509277, + "num_tokens": 707304844.0, + "step": 18536 + }, + { + "epoch": 2.358096934232286, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8430083990097046, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8678288459777832, + "num_tokens": 707347081.0, + "step": 18537 + }, + { + "epoch": 2.3582241445108765, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 7.800595283508301, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8587274551391602, + "num_tokens": 707388243.0, + "step": 18538 + }, + { + "epoch": 2.358351354789467, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1048567295074463, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8535547256469727, + "num_tokens": 707427109.0, + "step": 18539 + }, + { + "epoch": 2.3584785650680575, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8839119672775269, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8670558333396912, + "num_tokens": 707462502.0, + "step": 18540 + }, + { + "epoch": 2.358605775346648, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.6271073818206787, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8601765036582947, + "num_tokens": 707502861.0, + "step": 18541 + }, + { + "epoch": 2.3587329856252386, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0630908012390137, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8451733589172363, + "num_tokens": 707535868.0, + "step": 18542 + }, + { + "epoch": 2.358860195903829, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.042771339416504, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8687529563903809, + "num_tokens": 707566128.0, + "step": 18543 + }, + { + "epoch": 2.3589874061824196, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.865922212600708, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8729141354560852, + "num_tokens": 707605113.0, + "step": 18544 + }, + { + "epoch": 2.35911461646101, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.118891477584839, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8630983829498291, + "num_tokens": 707647968.0, + "step": 18545 + }, + { + "epoch": 2.3592418267396007, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9530686140060425, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.874722957611084, + "num_tokens": 707682183.0, + "step": 18546 + }, + { + "epoch": 2.3593690370181912, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9875941276550293, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8735437393188477, + "num_tokens": 707720938.0, + "step": 18547 + }, + { + "epoch": 2.3594962472967818, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9200286865234375, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.855054497718811, + "num_tokens": 707762624.0, + "step": 18548 + }, + { + "epoch": 2.3596234575753723, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9288462400436401, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8577638268470764, + "num_tokens": 707798406.0, + "step": 18549 + }, + { + "epoch": 2.3597506678539624, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9105933904647827, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8665945529937744, + "num_tokens": 707835935.0, + "step": 18550 + }, + { + "epoch": 2.3598778781325533, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8798437118530273, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8778601884841919, + "num_tokens": 707872971.0, + "step": 18551 + }, + { + "epoch": 2.3600050884111434, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9109967947006226, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8763847947120667, + "num_tokens": 707909798.0, + "step": 18552 + }, + { + "epoch": 2.360132298689734, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8884146213531494, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8698320388793945, + "num_tokens": 707951130.0, + "step": 18553 + }, + { + "epoch": 2.3602595089683245, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.905441164970398, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8523215651512146, + "num_tokens": 707989502.0, + "step": 18554 + }, + { + "epoch": 2.360386719246915, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.024113893508911, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8656644821166992, + "num_tokens": 708023521.0, + "step": 18555 + }, + { + "epoch": 2.3605139295255055, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.033393144607544, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8714776039123535, + "num_tokens": 708057594.0, + "step": 18556 + }, + { + "epoch": 2.360641139804096, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8465197086334229, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8633100986480713, + "num_tokens": 708101393.0, + "step": 18557 + }, + { + "epoch": 2.3607683500826866, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.954735279083252, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.875989556312561, + "num_tokens": 708137141.0, + "step": 18558 + }, + { + "epoch": 2.360895560361277, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 7.760703086853027, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8710110783576965, + "num_tokens": 708173497.0, + "step": 18559 + }, + { + "epoch": 2.3610227706398677, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.255626678466797, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8662993907928467, + "num_tokens": 708209412.0, + "step": 18560 + }, + { + "epoch": 2.361149980918458, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.379582643508911, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8735676407814026, + "num_tokens": 708239675.0, + "step": 18561 + }, + { + "epoch": 2.3612771911970487, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.097701072692871, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8601018190383911, + "num_tokens": 708274918.0, + "step": 18562 + }, + { + "epoch": 2.3614044014756392, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0241804122924805, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8518195152282715, + "num_tokens": 708313645.0, + "step": 18563 + }, + { + "epoch": 2.3615316117542298, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7134613990783691, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8680614233016968, + "num_tokens": 708357985.0, + "step": 18564 + }, + { + "epoch": 2.3616588220328203, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9528528451919556, + "learning_rate": 1e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8432967662811279, + "num_tokens": 708393606.0, + "step": 18565 + }, + { + "epoch": 2.361786032311411, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.955209732055664, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8722538948059082, + "num_tokens": 708428941.0, + "step": 18566 + }, + { + "epoch": 2.3619132425900013, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.6964664459228516, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8699517250061035, + "num_tokens": 708467847.0, + "step": 18567 + }, + { + "epoch": 2.362040452868592, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0641727447509766, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8831210136413574, + "num_tokens": 708503212.0, + "step": 18568 + }, + { + "epoch": 2.3621676631471824, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 6.426299571990967, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8656663298606873, + "num_tokens": 708542897.0, + "step": 18569 + }, + { + "epoch": 2.362294873425773, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.324744939804077, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8530575633049011, + "num_tokens": 708574725.0, + "step": 18570 + }, + { + "epoch": 2.3624220837043635, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8750219345092773, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8759990930557251, + "num_tokens": 708613557.0, + "step": 18571 + }, + { + "epoch": 2.362549293982954, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8771591186523438, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8584011197090149, + "num_tokens": 708651872.0, + "step": 18572 + }, + { + "epoch": 2.3626765042615445, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.838975191116333, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8526469469070435, + "num_tokens": 708688654.0, + "step": 18573 + }, + { + "epoch": 2.362803714540135, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9426263570785522, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.872306227684021, + "num_tokens": 708723285.0, + "step": 18574 + }, + { + "epoch": 2.362930924818725, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9650321006774902, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8746771812438965, + "num_tokens": 708760617.0, + "step": 18575 + }, + { + "epoch": 2.363058135097316, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1393322944641113, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8702608942985535, + "num_tokens": 708790792.0, + "step": 18576 + }, + { + "epoch": 2.363185345375906, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9470134973526, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8563270568847656, + "num_tokens": 708829935.0, + "step": 18577 + }, + { + "epoch": 2.3633125556544967, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.042285203933716, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8592706322669983, + "num_tokens": 708870578.0, + "step": 18578 + }, + { + "epoch": 2.3634397659330872, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7987430095672607, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8655832409858704, + "num_tokens": 708907044.0, + "step": 18579 + }, + { + "epoch": 2.3635669762116778, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.841988444328308, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.848240852355957, + "num_tokens": 708947443.0, + "step": 18580 + }, + { + "epoch": 2.3636941864902683, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.024730920791626, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8670535683631897, + "num_tokens": 708983174.0, + "step": 18581 + }, + { + "epoch": 2.363821396768859, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8438044786453247, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8692727088928223, + "num_tokens": 709021950.0, + "step": 18582 + }, + { + "epoch": 2.3639486070474494, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0073401927948, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8715908527374268, + "num_tokens": 709057236.0, + "step": 18583 + }, + { + "epoch": 2.36407581732604, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.856111764907837, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8673291802406311, + "num_tokens": 709093376.0, + "step": 18584 + }, + { + "epoch": 2.3642030276046304, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.049288749694824, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.86125648021698, + "num_tokens": 709129502.0, + "step": 18585 + }, + { + "epoch": 2.364330237883221, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.987655758857727, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8635118007659912, + "num_tokens": 709162538.0, + "step": 18586 + }, + { + "epoch": 2.3644574481618115, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.094446897506714, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8540579080581665, + "num_tokens": 709201690.0, + "step": 18587 + }, + { + "epoch": 2.364584658440402, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9861888885498047, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8584203720092773, + "num_tokens": 709240593.0, + "step": 18588 + }, + { + "epoch": 2.3647118687189925, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8324686288833618, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8772940039634705, + "num_tokens": 709278192.0, + "step": 18589 + }, + { + "epoch": 2.364839078997583, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9954146146774292, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8843916654586792, + "num_tokens": 709311797.0, + "step": 18590 + }, + { + "epoch": 2.3649662892761736, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.88316011428833, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8642993569374084, + "num_tokens": 709350379.0, + "step": 18591 + }, + { + "epoch": 2.365093499554764, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.828432559967041, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8913103342056274, + "num_tokens": 709389030.0, + "step": 18592 + }, + { + "epoch": 2.3652207098333546, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8708617687225342, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8584237098693848, + "num_tokens": 709433692.0, + "step": 18593 + }, + { + "epoch": 2.365347920111945, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.138153314590454, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8658567667007446, + "num_tokens": 709470773.0, + "step": 18594 + }, + { + "epoch": 2.3654751303905357, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8315397500991821, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8607399463653564, + "num_tokens": 709506764.0, + "step": 18595 + }, + { + "epoch": 2.3656023406691262, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8099614381790161, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8629031181335449, + "num_tokens": 709546931.0, + "step": 18596 + }, + { + "epoch": 2.3657295509477168, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9774528741836548, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8659846186637878, + "num_tokens": 709582842.0, + "step": 18597 + }, + { + "epoch": 2.365856761226307, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.896541714668274, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8661565184593201, + "num_tokens": 709621574.0, + "step": 18598 + }, + { + "epoch": 2.365983971504898, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8075894117355347, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8835170269012451, + "num_tokens": 709662437.0, + "step": 18599 + }, + { + "epoch": 2.366111181783488, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6766763925552368, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8797680735588074, + "num_tokens": 709701318.0, + "step": 18600 + }, + { + "epoch": 2.3662383920620784, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7849581241607666, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8661039471626282, + "num_tokens": 709745156.0, + "step": 18601 + }, + { + "epoch": 2.366365602340669, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.828608751296997, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8606719970703125, + "num_tokens": 709785849.0, + "step": 18602 + }, + { + "epoch": 2.3664928126192595, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9549126625061035, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8701595664024353, + "num_tokens": 709830038.0, + "step": 18603 + }, + { + "epoch": 2.36662002289785, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9271910190582275, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8631047010421753, + "num_tokens": 709871174.0, + "step": 18604 + }, + { + "epoch": 2.3667472331764405, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8971149921417236, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8870567083358765, + "num_tokens": 709905921.0, + "step": 18605 + }, + { + "epoch": 2.366874443455031, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8733546733856201, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8810741901397705, + "num_tokens": 709939856.0, + "step": 18606 + }, + { + "epoch": 2.3670016537336216, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.807094693183899, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8793478012084961, + "num_tokens": 709978805.0, + "step": 18607 + }, + { + "epoch": 2.367128864012212, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8225427865982056, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8797175884246826, + "num_tokens": 710015015.0, + "step": 18608 + }, + { + "epoch": 2.3672560742908026, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8043043613433838, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8578940033912659, + "num_tokens": 710054887.0, + "step": 18609 + }, + { + "epoch": 2.367383284569393, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.050630807876587, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8663519620895386, + "num_tokens": 710097012.0, + "step": 18610 + }, + { + "epoch": 2.3675104948479837, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.099722385406494, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8493943214416504, + "num_tokens": 710130912.0, + "step": 18611 + }, + { + "epoch": 2.3676377051265742, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9665772914886475, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8635310530662537, + "num_tokens": 710168880.0, + "step": 18612 + }, + { + "epoch": 2.3677649154051648, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0016400814056396, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8761348724365234, + "num_tokens": 710205619.0, + "step": 18613 + }, + { + "epoch": 2.3678921256837553, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0889899730682373, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8726624846458435, + "num_tokens": 710239475.0, + "step": 18614 + }, + { + "epoch": 2.368019335962346, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.912553310394287, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8688833713531494, + "num_tokens": 710276533.0, + "step": 18615 + }, + { + "epoch": 2.3681465462409363, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8458071947097778, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8817662000656128, + "num_tokens": 710316731.0, + "step": 18616 + }, + { + "epoch": 2.368273756519527, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.018994092941284, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8612145185470581, + "num_tokens": 710352310.0, + "step": 18617 + }, + { + "epoch": 2.3684009667981174, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7772823572158813, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8625075817108154, + "num_tokens": 710400203.0, + "step": 18618 + }, + { + "epoch": 2.368528177076708, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1795878410339355, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.872795581817627, + "num_tokens": 710433054.0, + "step": 18619 + }, + { + "epoch": 2.3686553873552985, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9970625638961792, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8697215914726257, + "num_tokens": 710466879.0, + "step": 18620 + }, + { + "epoch": 2.368782597633889, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0461230278015137, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8601698875427246, + "num_tokens": 710502808.0, + "step": 18621 + }, + { + "epoch": 2.3689098079124795, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8400318622589111, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8547986745834351, + "num_tokens": 710546190.0, + "step": 18622 + }, + { + "epoch": 2.3690370181910696, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8493624925613403, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.868105947971344, + "num_tokens": 710585556.0, + "step": 18623 + }, + { + "epoch": 2.3691642284696606, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9039112329483032, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8704535961151123, + "num_tokens": 710621366.0, + "step": 18624 + }, + { + "epoch": 2.3692914387482507, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9581531286239624, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8711687922477722, + "num_tokens": 710660217.0, + "step": 18625 + }, + { + "epoch": 2.369418649026841, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1646487712860107, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.863248884677887, + "num_tokens": 710695798.0, + "step": 18626 + }, + { + "epoch": 2.3695458593054317, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.070760726928711, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8491670489311218, + "num_tokens": 710732739.0, + "step": 18627 + }, + { + "epoch": 2.3696730695840222, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9649685621261597, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8567520380020142, + "num_tokens": 710769651.0, + "step": 18628 + }, + { + "epoch": 2.3698002798626128, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7405515909194946, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.86506587266922, + "num_tokens": 710809487.0, + "step": 18629 + }, + { + "epoch": 2.3699274901412033, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8944352865219116, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.870407223701477, + "num_tokens": 710849152.0, + "step": 18630 + }, + { + "epoch": 2.370054700419794, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8581678867340088, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8519289493560791, + "num_tokens": 710890933.0, + "step": 18631 + }, + { + "epoch": 2.3701819106983844, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9593236446380615, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8697749376296997, + "num_tokens": 710928262.0, + "step": 18632 + }, + { + "epoch": 2.370309120976975, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.853309988975525, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8717885613441467, + "num_tokens": 710962969.0, + "step": 18633 + }, + { + "epoch": 2.3704363312555654, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.00498104095459, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8624705672264099, + "num_tokens": 710998717.0, + "step": 18634 + }, + { + "epoch": 2.370563541534156, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.034486770629883, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8518245220184326, + "num_tokens": 711032819.0, + "step": 18635 + }, + { + "epoch": 2.3706907518127465, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9806400537490845, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8567537069320679, + "num_tokens": 711069395.0, + "step": 18636 + }, + { + "epoch": 2.370817962091337, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9679286479949951, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8679131269454956, + "num_tokens": 711105754.0, + "step": 18637 + }, + { + "epoch": 2.3709451723699275, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.954205870628357, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8609700202941895, + "num_tokens": 711145291.0, + "step": 18638 + }, + { + "epoch": 2.371072382648518, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0413246154785156, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.86809903383255, + "num_tokens": 711183431.0, + "step": 18639 + }, + { + "epoch": 2.3711995929271086, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.987748384475708, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8513385653495789, + "num_tokens": 711229754.0, + "step": 18640 + }, + { + "epoch": 2.371326803205699, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.727030634880066, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8674144744873047, + "num_tokens": 711269071.0, + "step": 18641 + }, + { + "epoch": 2.3714540134842896, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8402379751205444, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8630856275558472, + "num_tokens": 711312036.0, + "step": 18642 + }, + { + "epoch": 2.37158122376288, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3261044025421143, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.87408447265625, + "num_tokens": 711347945.0, + "step": 18643 + }, + { + "epoch": 2.3717084340414707, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0205929279327393, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8438825607299805, + "num_tokens": 711388433.0, + "step": 18644 + }, + { + "epoch": 2.371835644320061, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8645055294036865, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8709405660629272, + "num_tokens": 711425821.0, + "step": 18645 + }, + { + "epoch": 2.3719628545986517, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.87175714969635, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8597447872161865, + "num_tokens": 711462725.0, + "step": 18646 + }, + { + "epoch": 2.3720900648772423, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.864546537399292, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8805844783782959, + "num_tokens": 711499987.0, + "step": 18647 + }, + { + "epoch": 2.3722172751558324, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8357833623886108, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8499644994735718, + "num_tokens": 711543658.0, + "step": 18648 + }, + { + "epoch": 2.3723444854344233, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9369876384735107, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.877199649810791, + "num_tokens": 711583875.0, + "step": 18649 + }, + { + "epoch": 2.3724716957130134, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.329683303833008, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8480468392372131, + "num_tokens": 711625543.0, + "step": 18650 + }, + { + "epoch": 2.372598905991604, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9198590517044067, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8662195801734924, + "num_tokens": 711660528.0, + "step": 18651 + }, + { + "epoch": 2.3727261162701945, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9391179084777832, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8815865516662598, + "num_tokens": 711695646.0, + "step": 18652 + }, + { + "epoch": 2.372853326548785, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9344698190689087, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8502522706985474, + "num_tokens": 711734691.0, + "step": 18653 + }, + { + "epoch": 2.3729805368273755, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0419514179229736, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8679483532905579, + "num_tokens": 711771518.0, + "step": 18654 + }, + { + "epoch": 2.373107747105966, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0382275581359863, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8557083606719971, + "num_tokens": 711808968.0, + "step": 18655 + }, + { + "epoch": 2.3732349573845566, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9795626401901245, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.873249888420105, + "num_tokens": 711842944.0, + "step": 18656 + }, + { + "epoch": 2.373362167663147, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.106858253479004, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8627616763114929, + "num_tokens": 711882740.0, + "step": 18657 + }, + { + "epoch": 2.3734893779417376, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8476104736328125, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8596078157424927, + "num_tokens": 711925362.0, + "step": 18658 + }, + { + "epoch": 2.373616588220328, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8914000988006592, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8710062503814697, + "num_tokens": 711971285.0, + "step": 18659 + }, + { + "epoch": 2.3737437984989187, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9464865922927856, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8618825674057007, + "num_tokens": 712013306.0, + "step": 18660 + }, + { + "epoch": 2.3738710087775092, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9259923696517944, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.866344153881073, + "num_tokens": 712050837.0, + "step": 18661 + }, + { + "epoch": 2.3739982190560998, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 3.996150016784668, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8822771906852722, + "num_tokens": 712085140.0, + "step": 18662 + }, + { + "epoch": 2.3741254293346903, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.780167579650879, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8801928758621216, + "num_tokens": 712122495.0, + "step": 18663 + }, + { + "epoch": 2.374252639613281, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9178214073181152, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8589637279510498, + "num_tokens": 712161312.0, + "step": 18664 + }, + { + "epoch": 2.3743798498918713, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9205896854400635, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8635202050209045, + "num_tokens": 712197275.0, + "step": 18665 + }, + { + "epoch": 2.374507060170462, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9948912858963013, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8538779020309448, + "num_tokens": 712231309.0, + "step": 18666 + }, + { + "epoch": 2.3746342704490524, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1178078651428223, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8607255816459656, + "num_tokens": 712264864.0, + "step": 18667 + }, + { + "epoch": 2.374761480727643, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2440576553344727, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8668601512908936, + "num_tokens": 712303564.0, + "step": 18668 + }, + { + "epoch": 2.3748886910062335, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0169100761413574, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8756920099258423, + "num_tokens": 712341872.0, + "step": 18669 + }, + { + "epoch": 2.375015901284824, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0559866428375244, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.853942334651947, + "num_tokens": 712379835.0, + "step": 18670 + }, + { + "epoch": 2.3751431115634145, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.031028985977173, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8621318340301514, + "num_tokens": 712417462.0, + "step": 18671 + }, + { + "epoch": 2.375270321842005, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9536018371582031, + "learning_rate": 1e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8395687341690063, + "num_tokens": 712461581.0, + "step": 18672 + }, + { + "epoch": 2.375397532120595, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0415658950805664, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8638121485710144, + "num_tokens": 712499615.0, + "step": 18673 + }, + { + "epoch": 2.375524742399186, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9966456890106201, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8760631084442139, + "num_tokens": 712533264.0, + "step": 18674 + }, + { + "epoch": 2.375651952677776, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8557039499282837, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8585159182548523, + "num_tokens": 712571496.0, + "step": 18675 + }, + { + "epoch": 2.3757791629563667, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9694100618362427, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8718215227127075, + "num_tokens": 712606510.0, + "step": 18676 + }, + { + "epoch": 2.3759063732349572, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8376716375350952, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8741737008094788, + "num_tokens": 712646218.0, + "step": 18677 + }, + { + "epoch": 2.3760335835135478, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0507047176361084, + "learning_rate": 1e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.833539605140686, + "num_tokens": 712681905.0, + "step": 18678 + }, + { + "epoch": 2.3761607937921383, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8853962421417236, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8783370852470398, + "num_tokens": 712719916.0, + "step": 18679 + }, + { + "epoch": 2.376288004070729, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7971606254577637, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.866601824760437, + "num_tokens": 712760849.0, + "step": 18680 + }, + { + "epoch": 2.3764152143493193, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.786823034286499, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8769601583480835, + "num_tokens": 712801199.0, + "step": 18681 + }, + { + "epoch": 2.37654242462791, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8827095031738281, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8589061498641968, + "num_tokens": 712843471.0, + "step": 18682 + }, + { + "epoch": 2.3766696349065004, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1833083629608154, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8658577799797058, + "num_tokens": 712880649.0, + "step": 18683 + }, + { + "epoch": 2.376796845185091, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8470760583877563, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8426096439361572, + "num_tokens": 712920574.0, + "step": 18684 + }, + { + "epoch": 2.3769240554636815, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7720593214035034, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8756245970726013, + "num_tokens": 712962544.0, + "step": 18685 + }, + { + "epoch": 2.377051265742272, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.937548041343689, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8696397542953491, + "num_tokens": 712999138.0, + "step": 18686 + }, + { + "epoch": 2.3771784760208625, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8443161249160767, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8665071725845337, + "num_tokens": 713041799.0, + "step": 18687 + }, + { + "epoch": 2.377305686299453, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7269964218139648, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8593993782997131, + "num_tokens": 713088248.0, + "step": 18688 + }, + { + "epoch": 2.3774328965780436, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0029571056365967, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8738687634468079, + "num_tokens": 713127239.0, + "step": 18689 + }, + { + "epoch": 2.377560106856634, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9022774696350098, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8692374229431152, + "num_tokens": 713168389.0, + "step": 18690 + }, + { + "epoch": 2.3776873171352246, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1542980670928955, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8588510155677795, + "num_tokens": 713208137.0, + "step": 18691 + }, + { + "epoch": 2.377814527413815, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.108118772506714, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8555204272270203, + "num_tokens": 713249344.0, + "step": 18692 + }, + { + "epoch": 2.3779417376924057, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.04006028175354, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8625596761703491, + "num_tokens": 713291376.0, + "step": 18693 + }, + { + "epoch": 2.378068947970996, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9336930513381958, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8641684055328369, + "num_tokens": 713329925.0, + "step": 18694 + }, + { + "epoch": 2.3781961582495867, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8704394102096558, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8683977723121643, + "num_tokens": 713372509.0, + "step": 18695 + }, + { + "epoch": 2.378323368528177, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9915704727172852, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8592181205749512, + "num_tokens": 713410055.0, + "step": 18696 + }, + { + "epoch": 2.378450578806768, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7972365617752075, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8622187376022339, + "num_tokens": 713453958.0, + "step": 18697 + }, + { + "epoch": 2.378577789085358, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9751497507095337, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.864582896232605, + "num_tokens": 713485683.0, + "step": 18698 + }, + { + "epoch": 2.3787049993639484, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7431232929229736, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8664604425430298, + "num_tokens": 713528565.0, + "step": 18699 + }, + { + "epoch": 2.378832209642539, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0011489391326904, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8612545728683472, + "num_tokens": 713560932.0, + "step": 18700 + }, + { + "epoch": 2.3789594199211295, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.123854160308838, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8621620535850525, + "num_tokens": 713592891.0, + "step": 18701 + }, + { + "epoch": 2.37908663019972, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8666967153549194, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8628414869308472, + "num_tokens": 713631285.0, + "step": 18702 + }, + { + "epoch": 2.3792138404783105, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.807497262954712, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8478212356567383, + "num_tokens": 713675143.0, + "step": 18703 + }, + { + "epoch": 2.379341050756901, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8417527675628662, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8708349466323853, + "num_tokens": 713717556.0, + "step": 18704 + }, + { + "epoch": 2.3794682610354916, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9968585968017578, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8568812608718872, + "num_tokens": 713763515.0, + "step": 18705 + }, + { + "epoch": 2.379595471314082, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0095813274383545, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8707331418991089, + "num_tokens": 713801064.0, + "step": 18706 + }, + { + "epoch": 2.3797226815926726, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9131122827529907, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8611437082290649, + "num_tokens": 713841662.0, + "step": 18707 + }, + { + "epoch": 2.379849891871263, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8829965591430664, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.868553876876831, + "num_tokens": 713883646.0, + "step": 18708 + }, + { + "epoch": 2.3799771021498537, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.058689594268799, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8859102129936218, + "num_tokens": 713925365.0, + "step": 18709 + }, + { + "epoch": 2.3801043124284442, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9594672918319702, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8621945381164551, + "num_tokens": 713961142.0, + "step": 18710 + }, + { + "epoch": 2.3802315227070348, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8639339208602905, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8728176355361938, + "num_tokens": 714001652.0, + "step": 18711 + }, + { + "epoch": 2.3803587329856253, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1151840686798096, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8680800199508667, + "num_tokens": 714043023.0, + "step": 18712 + }, + { + "epoch": 2.380485943264216, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.939664602279663, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8640103340148926, + "num_tokens": 714080389.0, + "step": 18713 + }, + { + "epoch": 2.3806131535428063, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.906912922859192, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8699837923049927, + "num_tokens": 714117695.0, + "step": 18714 + }, + { + "epoch": 2.380740363821397, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0899901390075684, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8726063966751099, + "num_tokens": 714154204.0, + "step": 18715 + }, + { + "epoch": 2.3808675740999874, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8547208309173584, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8486584424972534, + "num_tokens": 714194857.0, + "step": 18716 + }, + { + "epoch": 2.380994784378578, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9454728364944458, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8781601190567017, + "num_tokens": 714228034.0, + "step": 18717 + }, + { + "epoch": 2.3811219946571685, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8518167734146118, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8715698719024658, + "num_tokens": 714269688.0, + "step": 18718 + }, + { + "epoch": 2.381249204935759, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.050042152404785, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8593469858169556, + "num_tokens": 714309039.0, + "step": 18719 + }, + { + "epoch": 2.3813764152143495, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8511375188827515, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8715570569038391, + "num_tokens": 714347453.0, + "step": 18720 + }, + { + "epoch": 2.3815036254929396, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8085618019104004, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8630224466323853, + "num_tokens": 714385850.0, + "step": 18721 + }, + { + "epoch": 2.3816308357715306, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.7393014430999756, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8608208894729614, + "num_tokens": 714426735.0, + "step": 18722 + }, + { + "epoch": 2.3817580460501206, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8299522399902344, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.877443790435791, + "num_tokens": 714465279.0, + "step": 18723 + }, + { + "epoch": 2.381885256328711, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9460790157318115, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8532609343528748, + "num_tokens": 714505703.0, + "step": 18724 + }, + { + "epoch": 2.3820124666073017, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.918442726135254, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8605911135673523, + "num_tokens": 714543642.0, + "step": 18725 + }, + { + "epoch": 2.3821396768858922, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8019771575927734, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8819654583930969, + "num_tokens": 714583533.0, + "step": 18726 + }, + { + "epoch": 2.3822668871644828, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8475127220153809, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.870086133480072, + "num_tokens": 714625577.0, + "step": 18727 + }, + { + "epoch": 2.3823940974430733, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9437650442123413, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8607850670814514, + "num_tokens": 714664987.0, + "step": 18728 + }, + { + "epoch": 2.382521307721664, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9214348793029785, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8606439828872681, + "num_tokens": 714705881.0, + "step": 18729 + }, + { + "epoch": 2.3826485180002543, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9049036502838135, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8565603494644165, + "num_tokens": 714746012.0, + "step": 18730 + }, + { + "epoch": 2.382775728278845, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.770799994468689, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8675283789634705, + "num_tokens": 714790001.0, + "step": 18731 + }, + { + "epoch": 2.3829029385574354, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9329490661621094, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8731310367584229, + "num_tokens": 714824515.0, + "step": 18732 + }, + { + "epoch": 2.383030148836026, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9132320880889893, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8581954836845398, + "num_tokens": 714860841.0, + "step": 18733 + }, + { + "epoch": 2.3831573591146165, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.875458002090454, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8682459592819214, + "num_tokens": 714896373.0, + "step": 18734 + }, + { + "epoch": 2.383284569393207, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8935190439224243, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8688040971755981, + "num_tokens": 714926684.0, + "step": 18735 + }, + { + "epoch": 2.3834117796717975, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.971783995628357, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8809511065483093, + "num_tokens": 714961057.0, + "step": 18736 + }, + { + "epoch": 2.383538989950388, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.849217176437378, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8717701435089111, + "num_tokens": 715001215.0, + "step": 18737 + }, + { + "epoch": 2.3836662002289786, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.062406539916992, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.845360279083252, + "num_tokens": 715038252.0, + "step": 18738 + }, + { + "epoch": 2.383793410507569, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9797797203063965, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8647711873054504, + "num_tokens": 715078320.0, + "step": 18739 + }, + { + "epoch": 2.3839206207861596, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9140480756759644, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8596177101135254, + "num_tokens": 715115628.0, + "step": 18740 + }, + { + "epoch": 2.38404783106475, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.924018383026123, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8735716342926025, + "num_tokens": 715153294.0, + "step": 18741 + }, + { + "epoch": 2.3841750413433407, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8905085325241089, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8671437501907349, + "num_tokens": 715193736.0, + "step": 18742 + }, + { + "epoch": 2.384302251621931, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9138292074203491, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8673508763313293, + "num_tokens": 715234539.0, + "step": 18743 + }, + { + "epoch": 2.3844294619005217, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7810112237930298, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8788424730300903, + "num_tokens": 715271567.0, + "step": 18744 + }, + { + "epoch": 2.3845566721791123, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6910368204116821, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8781920671463013, + "num_tokens": 715314850.0, + "step": 18745 + }, + { + "epoch": 2.3846838824577024, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9688682556152344, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8715248703956604, + "num_tokens": 715353353.0, + "step": 18746 + }, + { + "epoch": 2.3848110927362933, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1339778900146484, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8755967617034912, + "num_tokens": 715387843.0, + "step": 18747 + }, + { + "epoch": 2.3849383030148834, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9629207849502563, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8523762226104736, + "num_tokens": 715428037.0, + "step": 18748 + }, + { + "epoch": 2.385065513293474, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1401524543762207, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8799862861633301, + "num_tokens": 715464403.0, + "step": 18749 + }, + { + "epoch": 2.3851927235720645, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.817297101020813, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8760010600090027, + "num_tokens": 715501820.0, + "step": 18750 + }, + { + "epoch": 2.385319933850655, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8005523681640625, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.877028226852417, + "num_tokens": 715543287.0, + "step": 18751 + }, + { + "epoch": 2.3854471441292455, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.077101469039917, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8759340047836304, + "num_tokens": 715578189.0, + "step": 18752 + }, + { + "epoch": 2.385574354407836, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0155704021453857, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8563258647918701, + "num_tokens": 715616447.0, + "step": 18753 + }, + { + "epoch": 2.3857015646864266, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9081321954727173, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.870090663433075, + "num_tokens": 715651692.0, + "step": 18754 + }, + { + "epoch": 2.385828774965017, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.293483018875122, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8571792840957642, + "num_tokens": 715687733.0, + "step": 18755 + }, + { + "epoch": 2.3859559852436076, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.883491039276123, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8432660698890686, + "num_tokens": 715727481.0, + "step": 18756 + }, + { + "epoch": 2.386083195522198, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9341951608657837, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.848863959312439, + "num_tokens": 715767549.0, + "step": 18757 + }, + { + "epoch": 2.3862104058007887, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9555158615112305, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8588965535163879, + "num_tokens": 715807251.0, + "step": 18758 + }, + { + "epoch": 2.386337616079379, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.931098222732544, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8509883284568787, + "num_tokens": 715846893.0, + "step": 18759 + }, + { + "epoch": 2.3864648263579697, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0471506118774414, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8518495559692383, + "num_tokens": 715883611.0, + "step": 18760 + }, + { + "epoch": 2.3865920366365603, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0855700969696045, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8672773241996765, + "num_tokens": 715915598.0, + "step": 18761 + }, + { + "epoch": 2.386719246915151, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8789056539535522, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8726383447647095, + "num_tokens": 715956647.0, + "step": 18762 + }, + { + "epoch": 2.3868464571937413, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8909809589385986, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8726527690887451, + "num_tokens": 715996627.0, + "step": 18763 + }, + { + "epoch": 2.386973667472332, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8693592548370361, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8867884874343872, + "num_tokens": 716027823.0, + "step": 18764 + }, + { + "epoch": 2.3871008777509224, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9546799659729004, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8525654077529907, + "num_tokens": 716067430.0, + "step": 18765 + }, + { + "epoch": 2.387228088029513, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8716191053390503, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8583181500434875, + "num_tokens": 716109436.0, + "step": 18766 + }, + { + "epoch": 2.3873552983081034, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9115471839904785, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8611224293708801, + "num_tokens": 716148157.0, + "step": 18767 + }, + { + "epoch": 2.387482508586694, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2964425086975098, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8685872554779053, + "num_tokens": 716184662.0, + "step": 18768 + }, + { + "epoch": 2.3876097188652845, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8552637100219727, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8588666915893555, + "num_tokens": 716227382.0, + "step": 18769 + }, + { + "epoch": 2.387736929143875, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9783263206481934, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.869147002696991, + "num_tokens": 716262236.0, + "step": 18770 + }, + { + "epoch": 2.387864139422465, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.036126136779785, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8751626014709473, + "num_tokens": 716295541.0, + "step": 18771 + }, + { + "epoch": 2.387991349701056, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.015228271484375, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8684819936752319, + "num_tokens": 716331796.0, + "step": 18772 + }, + { + "epoch": 2.388118559979646, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0153088569641113, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8568552732467651, + "num_tokens": 716369539.0, + "step": 18773 + }, + { + "epoch": 2.3882457702582367, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8279355764389038, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8823175430297852, + "num_tokens": 716409806.0, + "step": 18774 + }, + { + "epoch": 2.3883729805368272, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8725532293319702, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8579432964324951, + "num_tokens": 716448191.0, + "step": 18775 + }, + { + "epoch": 2.3885001908154178, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8539689779281616, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8671799898147583, + "num_tokens": 716486522.0, + "step": 18776 + }, + { + "epoch": 2.3886274010940083, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9437812566757202, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.866192638874054, + "num_tokens": 716519383.0, + "step": 18777 + }, + { + "epoch": 2.388754611372599, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9842697381973267, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8682140707969666, + "num_tokens": 716556136.0, + "step": 18778 + }, + { + "epoch": 2.3888818216511893, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2719852924346924, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8588557243347168, + "num_tokens": 716592346.0, + "step": 18779 + }, + { + "epoch": 2.38900903192978, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9133344888687134, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.859186589717865, + "num_tokens": 716633996.0, + "step": 18780 + }, + { + "epoch": 2.3891362422083704, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0725600719451904, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8608843684196472, + "num_tokens": 716669638.0, + "step": 18781 + }, + { + "epoch": 2.389263452486961, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.980385422706604, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8562047481536865, + "num_tokens": 716711194.0, + "step": 18782 + }, + { + "epoch": 2.3893906627655515, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8020973205566406, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8726516366004944, + "num_tokens": 716753402.0, + "step": 18783 + }, + { + "epoch": 2.389517873044142, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.829127311706543, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8680844306945801, + "num_tokens": 716794638.0, + "step": 18784 + }, + { + "epoch": 2.3896450833227325, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9593846797943115, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8368155360221863, + "num_tokens": 716834003.0, + "step": 18785 + }, + { + "epoch": 2.389772293601323, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8407145738601685, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8709912300109863, + "num_tokens": 716867349.0, + "step": 18786 + }, + { + "epoch": 2.3898995038799136, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1092488765716553, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8648699522018433, + "num_tokens": 716904271.0, + "step": 18787 + }, + { + "epoch": 2.390026714158504, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.177827835083008, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8673702478408813, + "num_tokens": 716936165.0, + "step": 18788 + }, + { + "epoch": 2.3901539244370946, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0690741539001465, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8650757074356079, + "num_tokens": 716974011.0, + "step": 18789 + }, + { + "epoch": 2.390281134715685, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9802589416503906, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8661683797836304, + "num_tokens": 717012438.0, + "step": 18790 + }, + { + "epoch": 2.3904083449942757, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.994969367980957, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8568985462188721, + "num_tokens": 717051755.0, + "step": 18791 + }, + { + "epoch": 2.390535555272866, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9185131788253784, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8657998442649841, + "num_tokens": 717090740.0, + "step": 18792 + }, + { + "epoch": 2.3906627655514567, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8252288103103638, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8387086987495422, + "num_tokens": 717132033.0, + "step": 18793 + }, + { + "epoch": 2.390789975830047, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.943363904953003, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8726601600646973, + "num_tokens": 717168255.0, + "step": 18794 + }, + { + "epoch": 2.390917186108638, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.030017375946045, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8630169630050659, + "num_tokens": 717206863.0, + "step": 18795 + }, + { + "epoch": 2.391044396387228, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.019996166229248, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8437708616256714, + "num_tokens": 717245374.0, + "step": 18796 + }, + { + "epoch": 2.3911716066658184, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8587981462478638, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8845572471618652, + "num_tokens": 717279821.0, + "step": 18797 + }, + { + "epoch": 2.391298816944409, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9259217977523804, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8575050234794617, + "num_tokens": 717318896.0, + "step": 18798 + }, + { + "epoch": 2.3914260272229995, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8591254949569702, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8566808104515076, + "num_tokens": 717361493.0, + "step": 18799 + }, + { + "epoch": 2.39155323750159, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8016936779022217, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8552696704864502, + "num_tokens": 717402801.0, + "step": 18800 + }, + { + "epoch": 2.3916804477801805, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8268120288848877, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.883594810962677, + "num_tokens": 717441275.0, + "step": 18801 + }, + { + "epoch": 2.391807658058771, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9113154411315918, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8531776070594788, + "num_tokens": 717477423.0, + "step": 18802 + }, + { + "epoch": 2.3919348683373616, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0054328441619873, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8578197360038757, + "num_tokens": 717512978.0, + "step": 18803 + }, + { + "epoch": 2.392062078615952, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.843159556388855, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.884809136390686, + "num_tokens": 717549710.0, + "step": 18804 + }, + { + "epoch": 2.3921892888945426, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9178658723831177, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8623741865158081, + "num_tokens": 717586588.0, + "step": 18805 + }, + { + "epoch": 2.392316499173133, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1225085258483887, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8780752420425415, + "num_tokens": 717621507.0, + "step": 18806 + }, + { + "epoch": 2.3924437094517237, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.951135516166687, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8657137155532837, + "num_tokens": 717655915.0, + "step": 18807 + }, + { + "epoch": 2.392570919730314, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9466724395751953, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8622004985809326, + "num_tokens": 717696357.0, + "step": 18808 + }, + { + "epoch": 2.3926981300089047, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0499863624572754, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8778352737426758, + "num_tokens": 717735397.0, + "step": 18809 + }, + { + "epoch": 2.3928253402874953, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8751558065414429, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8694877028465271, + "num_tokens": 717770081.0, + "step": 18810 + }, + { + "epoch": 2.392952550566086, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8681985139846802, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8813694715499878, + "num_tokens": 717805262.0, + "step": 18811 + }, + { + "epoch": 2.3930797608446763, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0159411430358887, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8649746775627136, + "num_tokens": 717841877.0, + "step": 18812 + }, + { + "epoch": 2.393206971123267, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9805357456207275, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8487508296966553, + "num_tokens": 717881435.0, + "step": 18813 + }, + { + "epoch": 2.3933341814018574, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 3.0759692192077637, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8855874538421631, + "num_tokens": 717922240.0, + "step": 18814 + }, + { + "epoch": 2.393461391680448, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9092844724655151, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8623340129852295, + "num_tokens": 717958438.0, + "step": 18815 + }, + { + "epoch": 2.3935886019590384, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0060555934906006, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8571059107780457, + "num_tokens": 717994451.0, + "step": 18816 + }, + { + "epoch": 2.393715812237629, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9521958827972412, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8682135939598083, + "num_tokens": 718034318.0, + "step": 18817 + }, + { + "epoch": 2.3938430225162195, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8048396110534668, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8659209609031677, + "num_tokens": 718074994.0, + "step": 18818 + }, + { + "epoch": 2.3939702327948096, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.905665397644043, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8617940545082092, + "num_tokens": 718118849.0, + "step": 18819 + }, + { + "epoch": 2.3940974430734006, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0038070678710938, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8717038631439209, + "num_tokens": 718152874.0, + "step": 18820 + }, + { + "epoch": 2.3942246533519906, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7858201265335083, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8591717481613159, + "num_tokens": 718194206.0, + "step": 18821 + }, + { + "epoch": 2.394351863630581, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.12418532371521, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8694682121276855, + "num_tokens": 718227408.0, + "step": 18822 + }, + { + "epoch": 2.3944790739091717, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0124213695526123, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8600876331329346, + "num_tokens": 718266962.0, + "step": 18823 + }, + { + "epoch": 2.3946062841877622, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.841667890548706, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8701181411743164, + "num_tokens": 718307873.0, + "step": 18824 + }, + { + "epoch": 2.3947334944663528, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6965088844299316, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8717167377471924, + "num_tokens": 718350296.0, + "step": 18825 + }, + { + "epoch": 2.3948607047449433, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9720147848129272, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8675506114959717, + "num_tokens": 718390734.0, + "step": 18826 + }, + { + "epoch": 2.394987915023534, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9270015954971313, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8807060718536377, + "num_tokens": 718435630.0, + "step": 18827 + }, + { + "epoch": 2.3951151253021243, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7719230651855469, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8721228241920471, + "num_tokens": 718474791.0, + "step": 18828 + }, + { + "epoch": 2.395242335580715, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9670490026474, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.847680926322937, + "num_tokens": 718514957.0, + "step": 18829 + }, + { + "epoch": 2.3953695458593054, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0719962120056152, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8773331642150879, + "num_tokens": 718551761.0, + "step": 18830 + }, + { + "epoch": 2.395496756137896, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0303397178649902, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8639668822288513, + "num_tokens": 718586445.0, + "step": 18831 + }, + { + "epoch": 2.3956239664164865, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7524160146713257, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.860871434211731, + "num_tokens": 718631105.0, + "step": 18832 + }, + { + "epoch": 2.395751176695077, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.885668396949768, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8616968393325806, + "num_tokens": 718672278.0, + "step": 18833 + }, + { + "epoch": 2.3958783869736675, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9120643138885498, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.871006965637207, + "num_tokens": 718713191.0, + "step": 18834 + }, + { + "epoch": 2.396005597252258, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0748815536499023, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8594291806221008, + "num_tokens": 718745012.0, + "step": 18835 + }, + { + "epoch": 2.3961328075308486, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8540114164352417, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8667949438095093, + "num_tokens": 718780149.0, + "step": 18836 + }, + { + "epoch": 2.396260017809439, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0750648975372314, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8605377078056335, + "num_tokens": 718812831.0, + "step": 18837 + }, + { + "epoch": 2.3963872280880296, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8924002647399902, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8743347525596619, + "num_tokens": 718847650.0, + "step": 18838 + }, + { + "epoch": 2.39651443836662, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9648057222366333, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8672220706939697, + "num_tokens": 718883417.0, + "step": 18839 + }, + { + "epoch": 2.3966416486452107, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0055549144744873, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8915258646011353, + "num_tokens": 718915501.0, + "step": 18840 + }, + { + "epoch": 2.396768858923801, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9052542448043823, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8807812333106995, + "num_tokens": 718954485.0, + "step": 18841 + }, + { + "epoch": 2.3968960692023917, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9075584411621094, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8501273989677429, + "num_tokens": 718995815.0, + "step": 18842 + }, + { + "epoch": 2.3970232794809823, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7763419151306152, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8809345960617065, + "num_tokens": 719032820.0, + "step": 18843 + }, + { + "epoch": 2.3971504897595723, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.044898748397827, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8631036877632141, + "num_tokens": 719066966.0, + "step": 18844 + }, + { + "epoch": 2.3972777000381633, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9926093816757202, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.865297794342041, + "num_tokens": 719108768.0, + "step": 18845 + }, + { + "epoch": 2.3974049103167534, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7890576124191284, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.863328754901886, + "num_tokens": 719151451.0, + "step": 18846 + }, + { + "epoch": 2.397532120595344, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9546197652816772, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8580698370933533, + "num_tokens": 719192103.0, + "step": 18847 + }, + { + "epoch": 2.3976593308739345, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1506271362304688, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8632735013961792, + "num_tokens": 719226514.0, + "step": 18848 + }, + { + "epoch": 2.397786541152525, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8103951215744019, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8901311159133911, + "num_tokens": 719262944.0, + "step": 18849 + }, + { + "epoch": 2.3979137514311155, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7258602380752563, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8725569844245911, + "num_tokens": 719306707.0, + "step": 18850 + }, + { + "epoch": 2.398040961709706, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9194666147232056, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8719265460968018, + "num_tokens": 719347523.0, + "step": 18851 + }, + { + "epoch": 2.3981681719882966, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0541188716888428, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8850383162498474, + "num_tokens": 719379474.0, + "step": 18852 + }, + { + "epoch": 2.398295382266887, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0991952419281006, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8499058485031128, + "num_tokens": 719421624.0, + "step": 18853 + }, + { + "epoch": 2.3984225925454776, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9920132160186768, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8619077205657959, + "num_tokens": 719455570.0, + "step": 18854 + }, + { + "epoch": 2.398549802824068, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6918977499008179, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8820211887359619, + "num_tokens": 719500300.0, + "step": 18855 + }, + { + "epoch": 2.3986770131026587, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.121872663497925, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8519694209098816, + "num_tokens": 719538119.0, + "step": 18856 + }, + { + "epoch": 2.398804223381249, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9263508319854736, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8552138209342957, + "num_tokens": 719574012.0, + "step": 18857 + }, + { + "epoch": 2.3989314336598397, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9639110565185547, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8785200119018555, + "num_tokens": 719609548.0, + "step": 18858 + }, + { + "epoch": 2.3990586439384303, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.930712103843689, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8602042198181152, + "num_tokens": 719647280.0, + "step": 18859 + }, + { + "epoch": 2.399185854217021, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.444484233856201, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8811548948287964, + "num_tokens": 719686987.0, + "step": 18860 + }, + { + "epoch": 2.3993130644956113, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8862864971160889, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8844967484474182, + "num_tokens": 719727974.0, + "step": 18861 + }, + { + "epoch": 2.399440274774202, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9405783414840698, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8651607036590576, + "num_tokens": 719764308.0, + "step": 18862 + }, + { + "epoch": 2.3995674850527924, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9164116382598877, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8545305728912354, + "num_tokens": 719801872.0, + "step": 18863 + }, + { + "epoch": 2.399694695331383, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.836558222770691, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8682627081871033, + "num_tokens": 719842504.0, + "step": 18864 + }, + { + "epoch": 2.3998219056099734, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9849205017089844, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8619171380996704, + "num_tokens": 719878885.0, + "step": 18865 + }, + { + "epoch": 2.399949115888564, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9067621231079102, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8503424525260925, + "num_tokens": 719922891.0, + "step": 18866 + }, + { + "epoch": 2.4000763261671545, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8913689851760864, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8646672964096069, + "num_tokens": 719960113.0, + "step": 18867 + }, + { + "epoch": 2.400203536445745, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7011287212371826, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8654977083206177, + "num_tokens": 720006079.0, + "step": 18868 + }, + { + "epoch": 2.400330746724335, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9222034215927124, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8532428741455078, + "num_tokens": 720042219.0, + "step": 18869 + }, + { + "epoch": 2.400457957002926, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0390870571136475, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8667256236076355, + "num_tokens": 720082574.0, + "step": 18870 + }, + { + "epoch": 2.400585167281516, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9436264038085938, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8623424172401428, + "num_tokens": 720115583.0, + "step": 18871 + }, + { + "epoch": 2.4007123775601067, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9384095668792725, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.855685830116272, + "num_tokens": 720151350.0, + "step": 18872 + }, + { + "epoch": 2.400839587838697, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.958064317703247, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8569989800453186, + "num_tokens": 720187072.0, + "step": 18873 + }, + { + "epoch": 2.4009667981172877, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9792544841766357, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8690225481987, + "num_tokens": 720222249.0, + "step": 18874 + }, + { + "epoch": 2.4010940083958783, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0531225204467773, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8639426231384277, + "num_tokens": 720264682.0, + "step": 18875 + }, + { + "epoch": 2.401221218674469, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7833526134490967, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8630304336547852, + "num_tokens": 720309935.0, + "step": 18876 + }, + { + "epoch": 2.4013484289530593, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.620033025741577, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.858664870262146, + "num_tokens": 720343992.0, + "step": 18877 + }, + { + "epoch": 2.40147563923165, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.94178307056427, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8642096519470215, + "num_tokens": 720379707.0, + "step": 18878 + }, + { + "epoch": 2.4016028495102404, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.919793963432312, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8615233898162842, + "num_tokens": 720419796.0, + "step": 18879 + }, + { + "epoch": 2.401730059788831, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8496203422546387, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8681789040565491, + "num_tokens": 720460919.0, + "step": 18880 + }, + { + "epoch": 2.4018572700674214, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2329795360565186, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.861961841583252, + "num_tokens": 720498403.0, + "step": 18881 + }, + { + "epoch": 2.401984480346012, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9226796627044678, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8665578365325928, + "num_tokens": 720534931.0, + "step": 18882 + }, + { + "epoch": 2.4021116906246025, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2345921993255615, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8708081245422363, + "num_tokens": 720574062.0, + "step": 18883 + }, + { + "epoch": 2.402238900903193, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9177017211914062, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.858533501625061, + "num_tokens": 720611552.0, + "step": 18884 + }, + { + "epoch": 2.4023661111817836, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8439902067184448, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8636784553527832, + "num_tokens": 720647886.0, + "step": 18885 + }, + { + "epoch": 2.402493321460374, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7539198398590088, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.870404839515686, + "num_tokens": 720688034.0, + "step": 18886 + }, + { + "epoch": 2.4026205317389646, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6570146083831787, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8723508715629578, + "num_tokens": 720727805.0, + "step": 18887 + }, + { + "epoch": 2.402747742017555, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1252567768096924, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.868592381477356, + "num_tokens": 720759717.0, + "step": 18888 + }, + { + "epoch": 2.4028749522961457, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.03538179397583, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8590520024299622, + "num_tokens": 720790435.0, + "step": 18889 + }, + { + "epoch": 2.403002162574736, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.956008791923523, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8617207407951355, + "num_tokens": 720831715.0, + "step": 18890 + }, + { + "epoch": 2.4031293728533267, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.927205204963684, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8657094836235046, + "num_tokens": 720869598.0, + "step": 18891 + }, + { + "epoch": 2.403256583131917, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9620473384857178, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8599517941474915, + "num_tokens": 720911122.0, + "step": 18892 + }, + { + "epoch": 2.403383793410508, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7817713022232056, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8682694435119629, + "num_tokens": 720952265.0, + "step": 18893 + }, + { + "epoch": 2.403511003689098, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.950973391532898, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8691155910491943, + "num_tokens": 720991191.0, + "step": 18894 + }, + { + "epoch": 2.4036382139676884, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.880744218826294, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8756887316703796, + "num_tokens": 721032524.0, + "step": 18895 + }, + { + "epoch": 2.403765424246279, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9469053745269775, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8640092611312866, + "num_tokens": 721072699.0, + "step": 18896 + }, + { + "epoch": 2.4038926345248695, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8117371797561646, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8776757717132568, + "num_tokens": 721109467.0, + "step": 18897 + }, + { + "epoch": 2.40401984480346, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.06369686126709, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8399490118026733, + "num_tokens": 721142108.0, + "step": 18898 + }, + { + "epoch": 2.4041470550820505, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8741981983184814, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.858425498008728, + "num_tokens": 721179875.0, + "step": 18899 + }, + { + "epoch": 2.404274265360641, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0551095008850098, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8613311052322388, + "num_tokens": 721217141.0, + "step": 18900 + }, + { + "epoch": 2.4044014756392316, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.94818913936615, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8594415783882141, + "num_tokens": 721256357.0, + "step": 18901 + }, + { + "epoch": 2.404528685917822, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9283021688461304, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.863621175289154, + "num_tokens": 721292610.0, + "step": 18902 + }, + { + "epoch": 2.4046558961964126, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.892091989517212, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8725384473800659, + "num_tokens": 721329048.0, + "step": 18903 + }, + { + "epoch": 2.404783106475003, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.280855178833008, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.878902792930603, + "num_tokens": 721364249.0, + "step": 18904 + }, + { + "epoch": 2.4049103167535937, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.358386516571045, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8721633553504944, + "num_tokens": 721391154.0, + "step": 18905 + }, + { + "epoch": 2.405037527032184, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.7014753818511963, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8762527704238892, + "num_tokens": 721428443.0, + "step": 18906 + }, + { + "epoch": 2.4051647373107747, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8815258741378784, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8544313907623291, + "num_tokens": 721474545.0, + "step": 18907 + }, + { + "epoch": 2.4052919475893653, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.093127489089966, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8645790815353394, + "num_tokens": 721513153.0, + "step": 18908 + }, + { + "epoch": 2.405419157867956, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9732331037521362, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8622487783432007, + "num_tokens": 721556287.0, + "step": 18909 + }, + { + "epoch": 2.4055463681465463, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0867080688476562, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8728890419006348, + "num_tokens": 721590219.0, + "step": 18910 + }, + { + "epoch": 2.405673578425137, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0329623222351074, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8653044700622559, + "num_tokens": 721627308.0, + "step": 18911 + }, + { + "epoch": 2.4058007887037274, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1655120849609375, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8619696497917175, + "num_tokens": 721664218.0, + "step": 18912 + }, + { + "epoch": 2.405927998982318, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0536227226257324, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8753505945205688, + "num_tokens": 721703511.0, + "step": 18913 + }, + { + "epoch": 2.4060552092609084, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.7867584228515625, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8778420686721802, + "num_tokens": 721741512.0, + "step": 18914 + }, + { + "epoch": 2.406182419539499, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0154531002044678, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8689737319946289, + "num_tokens": 721778443.0, + "step": 18915 + }, + { + "epoch": 2.4063096298180895, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0459721088409424, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8615688681602478, + "num_tokens": 721817226.0, + "step": 18916 + }, + { + "epoch": 2.4064368400966796, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8377984762191772, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8500080108642578, + "num_tokens": 721859419.0, + "step": 18917 + }, + { + "epoch": 2.4065640503752705, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7461705207824707, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8794713616371155, + "num_tokens": 721895183.0, + "step": 18918 + }, + { + "epoch": 2.4066912606538606, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9134881496429443, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8507005572319031, + "num_tokens": 721933566.0, + "step": 18919 + }, + { + "epoch": 2.406818470932451, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9547662734985352, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8679805994033813, + "num_tokens": 721968827.0, + "step": 18920 + }, + { + "epoch": 2.4069456812110417, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9836037158966064, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8683568239212036, + "num_tokens": 722004989.0, + "step": 18921 + }, + { + "epoch": 2.407072891489632, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.907423496246338, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8698955178260803, + "num_tokens": 722043681.0, + "step": 18922 + }, + { + "epoch": 2.4072001017682227, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8369144201278687, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8667945861816406, + "num_tokens": 722086541.0, + "step": 18923 + }, + { + "epoch": 2.4073273120468133, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8537023067474365, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8673059344291687, + "num_tokens": 722125519.0, + "step": 18924 + }, + { + "epoch": 2.407454522325404, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.007648229598999, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8621443510055542, + "num_tokens": 722157665.0, + "step": 18925 + }, + { + "epoch": 2.4075817326039943, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0143942832946777, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.878897488117218, + "num_tokens": 722191819.0, + "step": 18926 + }, + { + "epoch": 2.407708942882585, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0155537128448486, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8557735681533813, + "num_tokens": 722229755.0, + "step": 18927 + }, + { + "epoch": 2.4078361531611754, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0337347984313965, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8551188111305237, + "num_tokens": 722267440.0, + "step": 18928 + }, + { + "epoch": 2.407963363439766, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8452904224395752, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8624964952468872, + "num_tokens": 722310102.0, + "step": 18929 + }, + { + "epoch": 2.4080905737183564, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8589035272598267, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8588378429412842, + "num_tokens": 722346713.0, + "step": 18930 + }, + { + "epoch": 2.408217783996947, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8559869527816772, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8606206178665161, + "num_tokens": 722387803.0, + "step": 18931 + }, + { + "epoch": 2.4083449942755375, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0769295692443848, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.857315719127655, + "num_tokens": 722428943.0, + "step": 18932 + }, + { + "epoch": 2.408472204554128, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.02056622505188, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8593471050262451, + "num_tokens": 722467860.0, + "step": 18933 + }, + { + "epoch": 2.4085994148327186, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9372859001159668, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8615896701812744, + "num_tokens": 722503067.0, + "step": 18934 + }, + { + "epoch": 2.408726625111309, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9428949356079102, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8679218292236328, + "num_tokens": 722537842.0, + "step": 18935 + }, + { + "epoch": 2.4088538353898996, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2661476135253906, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8713081479072571, + "num_tokens": 722574036.0, + "step": 18936 + }, + { + "epoch": 2.40898104566849, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.054115056991577, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8651554584503174, + "num_tokens": 722606087.0, + "step": 18937 + }, + { + "epoch": 2.4091082559470807, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7996556758880615, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8647150993347168, + "num_tokens": 722648745.0, + "step": 18938 + }, + { + "epoch": 2.409235466225671, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0130836963653564, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.857427179813385, + "num_tokens": 722688589.0, + "step": 18939 + }, + { + "epoch": 2.4093626765042617, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9789091348648071, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8845508694648743, + "num_tokens": 722723941.0, + "step": 18940 + }, + { + "epoch": 2.4094898867828523, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.832303762435913, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.849368691444397, + "num_tokens": 722768242.0, + "step": 18941 + }, + { + "epoch": 2.4096170970614423, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9674718379974365, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8658303022384644, + "num_tokens": 722800970.0, + "step": 18942 + }, + { + "epoch": 2.4097443073400333, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9698684215545654, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8558806777000427, + "num_tokens": 722839168.0, + "step": 18943 + }, + { + "epoch": 2.4098715176186234, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8707878589630127, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8760474324226379, + "num_tokens": 722883451.0, + "step": 18944 + }, + { + "epoch": 2.409998727897214, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7812007665634155, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8663082122802734, + "num_tokens": 722928581.0, + "step": 18945 + }, + { + "epoch": 2.4101259381758044, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8753670454025269, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8769703507423401, + "num_tokens": 722967141.0, + "step": 18946 + }, + { + "epoch": 2.410253148454395, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8283569812774658, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8607151508331299, + "num_tokens": 723006263.0, + "step": 18947 + }, + { + "epoch": 2.4103803587329855, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7556190490722656, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8680480718612671, + "num_tokens": 723050843.0, + "step": 18948 + }, + { + "epoch": 2.410507569011576, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.81488835811615, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8656972646713257, + "num_tokens": 723092124.0, + "step": 18949 + }, + { + "epoch": 2.4106347792901666, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9399603605270386, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8760133981704712, + "num_tokens": 723122478.0, + "step": 18950 + }, + { + "epoch": 2.410761989568757, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.043118476867676, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8591412901878357, + "num_tokens": 723163054.0, + "step": 18951 + }, + { + "epoch": 2.4108891998473476, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7293998003005981, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8660464286804199, + "num_tokens": 723201518.0, + "step": 18952 + }, + { + "epoch": 2.411016410125938, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9806652069091797, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8668954968452454, + "num_tokens": 723235685.0, + "step": 18953 + }, + { + "epoch": 2.4111436204045287, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9780359268188477, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8646694421768188, + "num_tokens": 723266510.0, + "step": 18954 + }, + { + "epoch": 2.411270830683119, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.776227593421936, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8695462346076965, + "num_tokens": 723302800.0, + "step": 18955 + }, + { + "epoch": 2.4113980409617097, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9218932390213013, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8674770593643188, + "num_tokens": 723340669.0, + "step": 18956 + }, + { + "epoch": 2.4115252512403003, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8063548803329468, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8683171272277832, + "num_tokens": 723377556.0, + "step": 18957 + }, + { + "epoch": 2.411652461518891, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.887310266494751, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8696565628051758, + "num_tokens": 723410942.0, + "step": 18958 + }, + { + "epoch": 2.4117796717974813, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7641016244888306, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8774542212486267, + "num_tokens": 723451131.0, + "step": 18959 + }, + { + "epoch": 2.411906882076072, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.083616018295288, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8685583472251892, + "num_tokens": 723485876.0, + "step": 18960 + }, + { + "epoch": 2.4120340923546624, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.5739235877990723, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8719934821128845, + "num_tokens": 723521252.0, + "step": 18961 + }, + { + "epoch": 2.412161302633253, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.080888032913208, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8507840633392334, + "num_tokens": 723561094.0, + "step": 18962 + }, + { + "epoch": 2.4122885129118434, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9676709175109863, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.863686203956604, + "num_tokens": 723597281.0, + "step": 18963 + }, + { + "epoch": 2.412415723190434, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.029956102371216, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.888688325881958, + "num_tokens": 723637690.0, + "step": 18964 + }, + { + "epoch": 2.4125429334690245, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9305503368377686, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.86528480052948, + "num_tokens": 723676250.0, + "step": 18965 + }, + { + "epoch": 2.412670143747615, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8174327611923218, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8735122680664062, + "num_tokens": 723713157.0, + "step": 18966 + }, + { + "epoch": 2.412797354026205, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2337453365325928, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8707213401794434, + "num_tokens": 723745877.0, + "step": 18967 + }, + { + "epoch": 2.412924564304796, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8793058395385742, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8666293621063232, + "num_tokens": 723784285.0, + "step": 18968 + }, + { + "epoch": 2.413051774583386, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0429747104644775, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8549994230270386, + "num_tokens": 723817085.0, + "step": 18969 + }, + { + "epoch": 2.4131789848619767, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.056610584259033, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8606843948364258, + "num_tokens": 723858466.0, + "step": 18970 + }, + { + "epoch": 2.413306195140567, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.006165027618408, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8654229640960693, + "num_tokens": 723896627.0, + "step": 18971 + }, + { + "epoch": 2.4134334054191577, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.904436707496643, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8610483407974243, + "num_tokens": 723930727.0, + "step": 18972 + }, + { + "epoch": 2.4135606156977483, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0159409046173096, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8546464443206787, + "num_tokens": 723965996.0, + "step": 18973 + }, + { + "epoch": 2.413687825976339, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.155046224594116, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8594973087310791, + "num_tokens": 724002441.0, + "step": 18974 + }, + { + "epoch": 2.4138150362549293, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8573031425476074, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8704423904418945, + "num_tokens": 724039238.0, + "step": 18975 + }, + { + "epoch": 2.41394224653352, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8384652137756348, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8714970350265503, + "num_tokens": 724077430.0, + "step": 18976 + }, + { + "epoch": 2.4140694568121104, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7762023210525513, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8642590045928955, + "num_tokens": 724113175.0, + "step": 18977 + }, + { + "epoch": 2.414196667090701, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.053220272064209, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8609147071838379, + "num_tokens": 724156409.0, + "step": 18978 + }, + { + "epoch": 2.4143238773692914, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8531028032302856, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8652949333190918, + "num_tokens": 724199879.0, + "step": 18979 + }, + { + "epoch": 2.414451087647882, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8870351314544678, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8804073333740234, + "num_tokens": 724238065.0, + "step": 18980 + }, + { + "epoch": 2.4145782979264725, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.049072504043579, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8585125803947449, + "num_tokens": 724272336.0, + "step": 18981 + }, + { + "epoch": 2.414705508205063, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.01725435256958, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8564234972000122, + "num_tokens": 724305200.0, + "step": 18982 + }, + { + "epoch": 2.4148327184836536, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.996004343032837, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8703441619873047, + "num_tokens": 724341295.0, + "step": 18983 + }, + { + "epoch": 2.414959928762244, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9327607154846191, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8682390451431274, + "num_tokens": 724384336.0, + "step": 18984 + }, + { + "epoch": 2.4150871390408346, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9904720783233643, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8619275689125061, + "num_tokens": 724417279.0, + "step": 18985 + }, + { + "epoch": 2.415214349319425, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7782503366470337, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8656800985336304, + "num_tokens": 724456509.0, + "step": 18986 + }, + { + "epoch": 2.4153415595980157, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.6871789693832397, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8768147230148315, + "num_tokens": 724498812.0, + "step": 18987 + }, + { + "epoch": 2.415468769876606, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0067055225372314, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8545471429824829, + "num_tokens": 724538174.0, + "step": 18988 + }, + { + "epoch": 2.4155959801551967, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8637723922729492, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8702161908149719, + "num_tokens": 724577171.0, + "step": 18989 + }, + { + "epoch": 2.415723190433787, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 80.52132415771484, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8869683742523193, + "num_tokens": 724610626.0, + "step": 18990 + }, + { + "epoch": 2.4158504007123778, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0442869663238525, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8774142861366272, + "num_tokens": 724647613.0, + "step": 18991 + }, + { + "epoch": 2.415977610990968, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9773521423339844, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8677539825439453, + "num_tokens": 724684898.0, + "step": 18992 + }, + { + "epoch": 2.4161048212695584, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7836633920669556, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8633576035499573, + "num_tokens": 724725566.0, + "step": 18993 + }, + { + "epoch": 2.416232031548149, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8988564014434814, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8572804927825928, + "num_tokens": 724763701.0, + "step": 18994 + }, + { + "epoch": 2.4163592418267394, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.081643581390381, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8635882139205933, + "num_tokens": 724798108.0, + "step": 18995 + }, + { + "epoch": 2.41648645210533, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9587689638137817, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8446996808052063, + "num_tokens": 724842430.0, + "step": 18996 + }, + { + "epoch": 2.4166136623839205, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7902636528015137, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8608698844909668, + "num_tokens": 724884585.0, + "step": 18997 + }, + { + "epoch": 2.416740872662511, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8583623170852661, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8778220415115356, + "num_tokens": 724919338.0, + "step": 18998 + }, + { + "epoch": 2.4168680829411016, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7855504751205444, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8883284330368042, + "num_tokens": 724956007.0, + "step": 18999 + }, + { + "epoch": 2.416995293219692, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8589495420455933, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8819759488105774, + "num_tokens": 724996994.0, + "step": 19000 + }, + { + "epoch": 2.4171225034982826, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1580092906951904, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8588269948959351, + "num_tokens": 725035607.0, + "step": 19001 + }, + { + "epoch": 2.417249713776873, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.909271240234375, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8696539998054504, + "num_tokens": 725075370.0, + "step": 19002 + }, + { + "epoch": 2.4173769240554637, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.731621503829956, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8645793795585632, + "num_tokens": 725119536.0, + "step": 19003 + }, + { + "epoch": 2.417504134334054, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7616908550262451, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8657065629959106, + "num_tokens": 725158048.0, + "step": 19004 + }, + { + "epoch": 2.4176313446126447, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.065704584121704, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8754124641418457, + "num_tokens": 725196343.0, + "step": 19005 + }, + { + "epoch": 2.4177585548912353, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.869034767150879, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8678818941116333, + "num_tokens": 725234613.0, + "step": 19006 + }, + { + "epoch": 2.417885765169826, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.820380449295044, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8635687828063965, + "num_tokens": 725272129.0, + "step": 19007 + }, + { + "epoch": 2.4180129754484163, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1602513790130615, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8572565913200378, + "num_tokens": 725305740.0, + "step": 19008 + }, + { + "epoch": 2.418140185727007, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1466128826141357, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8626003265380859, + "num_tokens": 725340579.0, + "step": 19009 + }, + { + "epoch": 2.4182673960055974, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7906413078308105, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8676444292068481, + "num_tokens": 725380492.0, + "step": 19010 + }, + { + "epoch": 2.418394606284188, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.7795655727386475, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8668870329856873, + "num_tokens": 725418043.0, + "step": 19011 + }, + { + "epoch": 2.4185218165627784, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 16.62568473815918, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8575495481491089, + "num_tokens": 725456260.0, + "step": 19012 + }, + { + "epoch": 2.418649026841369, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0703256130218506, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8625685572624207, + "num_tokens": 725496605.0, + "step": 19013 + }, + { + "epoch": 2.4187762371199595, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0211799144744873, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8660767674446106, + "num_tokens": 725532395.0, + "step": 19014 + }, + { + "epoch": 2.4189034473985496, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8759099245071411, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.860495924949646, + "num_tokens": 725575335.0, + "step": 19015 + }, + { + "epoch": 2.4190306576771405, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7699038982391357, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8723275661468506, + "num_tokens": 725612345.0, + "step": 19016 + }, + { + "epoch": 2.4191578679557306, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1505305767059326, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8689996004104614, + "num_tokens": 725645566.0, + "step": 19017 + }, + { + "epoch": 2.419285078234321, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8763642311096191, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8616353273391724, + "num_tokens": 725684218.0, + "step": 19018 + }, + { + "epoch": 2.4194122885129117, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8646454811096191, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8707982897758484, + "num_tokens": 725718478.0, + "step": 19019 + }, + { + "epoch": 2.419539498791502, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9090783596038818, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8593481779098511, + "num_tokens": 725756208.0, + "step": 19020 + }, + { + "epoch": 2.4196667090700927, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9035519361495972, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8604907989501953, + "num_tokens": 725791884.0, + "step": 19021 + }, + { + "epoch": 2.4197939193486833, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9462929964065552, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8695423603057861, + "num_tokens": 725829138.0, + "step": 19022 + }, + { + "epoch": 2.419921129627274, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7984988689422607, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8670414686203003, + "num_tokens": 725868301.0, + "step": 19023 + }, + { + "epoch": 2.4200483399058643, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8086425065994263, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8535696268081665, + "num_tokens": 725906186.0, + "step": 19024 + }, + { + "epoch": 2.420175550184455, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9350969791412354, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.85411137342453, + "num_tokens": 725947006.0, + "step": 19025 + }, + { + "epoch": 2.4203027604630454, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9222462177276611, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8687440156936646, + "num_tokens": 725987363.0, + "step": 19026 + }, + { + "epoch": 2.420429970741636, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8760708570480347, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.856309711933136, + "num_tokens": 726026176.0, + "step": 19027 + }, + { + "epoch": 2.4205571810202264, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8603601455688477, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8638997673988342, + "num_tokens": 726068124.0, + "step": 19028 + }, + { + "epoch": 2.420684391298817, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.848968744277954, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8584463000297546, + "num_tokens": 726110270.0, + "step": 19029 + }, + { + "epoch": 2.4208116015774075, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9351218938827515, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8808377981185913, + "num_tokens": 726143908.0, + "step": 19030 + }, + { + "epoch": 2.420938811855998, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.056154251098633, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.863834023475647, + "num_tokens": 726179082.0, + "step": 19031 + }, + { + "epoch": 2.4210660221345885, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9807653427124023, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8591958284378052, + "num_tokens": 726216387.0, + "step": 19032 + }, + { + "epoch": 2.421193232413179, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8571171760559082, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8749083876609802, + "num_tokens": 726258831.0, + "step": 19033 + }, + { + "epoch": 2.4213204426917696, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.094729423522949, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.866938591003418, + "num_tokens": 726301929.0, + "step": 19034 + }, + { + "epoch": 2.42144765297036, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.144179344177246, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8685334324836731, + "num_tokens": 726339938.0, + "step": 19035 + }, + { + "epoch": 2.4215748632489507, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8294318914413452, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8676258325576782, + "num_tokens": 726377197.0, + "step": 19036 + }, + { + "epoch": 2.421702073527541, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8334715366363525, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8692249655723572, + "num_tokens": 726417591.0, + "step": 19037 + }, + { + "epoch": 2.4218292838061317, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9476732015609741, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8547124266624451, + "num_tokens": 726456094.0, + "step": 19038 + }, + { + "epoch": 2.4219564940847222, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9080830812454224, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8749006986618042, + "num_tokens": 726491158.0, + "step": 19039 + }, + { + "epoch": 2.4220837043633123, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.161113739013672, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8520767092704773, + "num_tokens": 726531334.0, + "step": 19040 + }, + { + "epoch": 2.4222109146419033, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9069690704345703, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8575003147125244, + "num_tokens": 726571999.0, + "step": 19041 + }, + { + "epoch": 2.4223381249204934, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1357181072235107, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8561241030693054, + "num_tokens": 726604945.0, + "step": 19042 + }, + { + "epoch": 2.422465335199084, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9620997905731201, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8575806617736816, + "num_tokens": 726645023.0, + "step": 19043 + }, + { + "epoch": 2.4225925454776744, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8827236890792847, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8659617900848389, + "num_tokens": 726684297.0, + "step": 19044 + }, + { + "epoch": 2.422719755756265, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7761608362197876, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8553783893585205, + "num_tokens": 726726950.0, + "step": 19045 + }, + { + "epoch": 2.4228469660348555, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.070589542388916, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8474112153053284, + "num_tokens": 726761944.0, + "step": 19046 + }, + { + "epoch": 2.422974176313446, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.852987289428711, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8751010298728943, + "num_tokens": 726799041.0, + "step": 19047 + }, + { + "epoch": 2.4231013865920366, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8370743989944458, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8588224649429321, + "num_tokens": 726838160.0, + "step": 19048 + }, + { + "epoch": 2.423228596870627, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.128858804702759, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8748272061347961, + "num_tokens": 726873150.0, + "step": 19049 + }, + { + "epoch": 2.4233558071492176, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0712943077087402, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8703890442848206, + "num_tokens": 726905253.0, + "step": 19050 + }, + { + "epoch": 2.423483017427808, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9845205545425415, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8773166537284851, + "num_tokens": 726942842.0, + "step": 19051 + }, + { + "epoch": 2.4236102277063987, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7708234786987305, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8706861734390259, + "num_tokens": 726987974.0, + "step": 19052 + }, + { + "epoch": 2.423737437984989, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8207406997680664, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8842231035232544, + "num_tokens": 727021614.0, + "step": 19053 + }, + { + "epoch": 2.4238646482635797, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9975786209106445, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8637369871139526, + "num_tokens": 727061773.0, + "step": 19054 + }, + { + "epoch": 2.4239918585421703, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9956454038619995, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8591827154159546, + "num_tokens": 727095877.0, + "step": 19055 + }, + { + "epoch": 2.424119068820761, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0878102779388428, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8692032098770142, + "num_tokens": 727128676.0, + "step": 19056 + }, + { + "epoch": 2.4242462790993513, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8273369073867798, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8748584389686584, + "num_tokens": 727164528.0, + "step": 19057 + }, + { + "epoch": 2.424373489377942, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8603055477142334, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8728723526000977, + "num_tokens": 727202403.0, + "step": 19058 + }, + { + "epoch": 2.4245006996565324, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 3.0468015670776367, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.867699146270752, + "num_tokens": 727242231.0, + "step": 19059 + }, + { + "epoch": 2.424627909935123, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1459591388702393, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8512584567070007, + "num_tokens": 727276371.0, + "step": 19060 + }, + { + "epoch": 2.4247551202137134, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9679185152053833, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8684719204902649, + "num_tokens": 727320101.0, + "step": 19061 + }, + { + "epoch": 2.424882330492304, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7930316925048828, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8566992282867432, + "num_tokens": 727363687.0, + "step": 19062 + }, + { + "epoch": 2.4250095407708945, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8859601020812988, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8705092072486877, + "num_tokens": 727399772.0, + "step": 19063 + }, + { + "epoch": 2.425136751049485, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8477299213409424, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8747844696044922, + "num_tokens": 727442944.0, + "step": 19064 + }, + { + "epoch": 2.425263961328075, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9163483381271362, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8741908073425293, + "num_tokens": 727483283.0, + "step": 19065 + }, + { + "epoch": 2.425391171606666, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.887555718421936, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8634577989578247, + "num_tokens": 727526316.0, + "step": 19066 + }, + { + "epoch": 2.425518381885256, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 3.0449306964874268, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8735594749450684, + "num_tokens": 727563218.0, + "step": 19067 + }, + { + "epoch": 2.4256455921638467, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0475852489471436, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8785608410835266, + "num_tokens": 727594877.0, + "step": 19068 + }, + { + "epoch": 2.425772802442437, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8553273677825928, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8785325288772583, + "num_tokens": 727627561.0, + "step": 19069 + }, + { + "epoch": 2.4259000127210277, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8378878831863403, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8579800128936768, + "num_tokens": 727664384.0, + "step": 19070 + }, + { + "epoch": 2.4260272229996183, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8963617086410522, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8614161014556885, + "num_tokens": 727700866.0, + "step": 19071 + }, + { + "epoch": 2.426154433278209, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.09708309173584, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8843032121658325, + "num_tokens": 727741539.0, + "step": 19072 + }, + { + "epoch": 2.4262816435567993, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.021772623062134, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8674314618110657, + "num_tokens": 727777828.0, + "step": 19073 + }, + { + "epoch": 2.42640885383539, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7163299322128296, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8745759725570679, + "num_tokens": 727818979.0, + "step": 19074 + }, + { + "epoch": 2.4265360641139804, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9196373224258423, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.865747332572937, + "num_tokens": 727856440.0, + "step": 19075 + }, + { + "epoch": 2.426663274392571, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8531460762023926, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.875505268573761, + "num_tokens": 727895036.0, + "step": 19076 + }, + { + "epoch": 2.4267904846711614, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9701226949691772, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8548740148544312, + "num_tokens": 727932292.0, + "step": 19077 + }, + { + "epoch": 2.426917694949752, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.933422565460205, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8669737577438354, + "num_tokens": 727967005.0, + "step": 19078 + }, + { + "epoch": 2.4270449052283425, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9601516723632812, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8569127321243286, + "num_tokens": 728004262.0, + "step": 19079 + }, + { + "epoch": 2.427172115506933, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9361801147460938, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8749855756759644, + "num_tokens": 728041451.0, + "step": 19080 + }, + { + "epoch": 2.4272993257855235, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8463945388793945, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8641554117202759, + "num_tokens": 728079217.0, + "step": 19081 + }, + { + "epoch": 2.427426536064114, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.79874587059021, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8826152086257935, + "num_tokens": 728116221.0, + "step": 19082 + }, + { + "epoch": 2.4275537463427046, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0703814029693604, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8571080565452576, + "num_tokens": 728154861.0, + "step": 19083 + }, + { + "epoch": 2.427680956621295, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.989660620689392, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8542209267616272, + "num_tokens": 728188276.0, + "step": 19084 + }, + { + "epoch": 2.4278081668998857, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0224766731262207, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8774919509887695, + "num_tokens": 728229313.0, + "step": 19085 + }, + { + "epoch": 2.427935377178476, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9134900569915771, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8560723066329956, + "num_tokens": 728268706.0, + "step": 19086 + }, + { + "epoch": 2.4280625874570667, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.931702733039856, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8797316551208496, + "num_tokens": 728297490.0, + "step": 19087 + }, + { + "epoch": 2.428189797735657, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.192385673522949, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8598951697349548, + "num_tokens": 728336041.0, + "step": 19088 + }, + { + "epoch": 2.4283170080142478, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9497498273849487, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8739729523658752, + "num_tokens": 728372701.0, + "step": 19089 + }, + { + "epoch": 2.428444218292838, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9316054582595825, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8692181706428528, + "num_tokens": 728408138.0, + "step": 19090 + }, + { + "epoch": 2.4285714285714284, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7805145978927612, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8723263144493103, + "num_tokens": 728452649.0, + "step": 19091 + }, + { + "epoch": 2.428698638850019, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8977645635604858, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8563429713249207, + "num_tokens": 728490551.0, + "step": 19092 + }, + { + "epoch": 2.4288258491286094, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9024771451950073, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8717238903045654, + "num_tokens": 728529558.0, + "step": 19093 + }, + { + "epoch": 2.4289530594072, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8148642778396606, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8424603939056396, + "num_tokens": 728573495.0, + "step": 19094 + }, + { + "epoch": 2.4290802696857905, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3890604972839355, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8661614060401917, + "num_tokens": 728609166.0, + "step": 19095 + }, + { + "epoch": 2.429207479964381, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7245314121246338, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8736484050750732, + "num_tokens": 728653070.0, + "step": 19096 + }, + { + "epoch": 2.4293346902429716, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8568029403686523, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8682969808578491, + "num_tokens": 728689142.0, + "step": 19097 + }, + { + "epoch": 2.429461900521562, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.003775119781494, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.86896151304245, + "num_tokens": 728725615.0, + "step": 19098 + }, + { + "epoch": 2.4295891108001526, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9969450235366821, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.85700523853302, + "num_tokens": 728762366.0, + "step": 19099 + }, + { + "epoch": 2.429716321078743, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8055599927902222, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8557248115539551, + "num_tokens": 728801951.0, + "step": 19100 + }, + { + "epoch": 2.4298435313573337, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.928824543952942, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8744148015975952, + "num_tokens": 728836403.0, + "step": 19101 + }, + { + "epoch": 2.429970741635924, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8643344640731812, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8736544847488403, + "num_tokens": 728878485.0, + "step": 19102 + }, + { + "epoch": 2.4300979519145147, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2200262546539307, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8757935762405396, + "num_tokens": 728915935.0, + "step": 19103 + }, + { + "epoch": 2.4302251621931052, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.924636721611023, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8489649295806885, + "num_tokens": 728956847.0, + "step": 19104 + }, + { + "epoch": 2.4303523724716958, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.918180227279663, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8705723881721497, + "num_tokens": 728994865.0, + "step": 19105 + }, + { + "epoch": 2.4304795827502863, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9023305177688599, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8581984639167786, + "num_tokens": 729033030.0, + "step": 19106 + }, + { + "epoch": 2.430606793028877, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9451061487197876, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8777326941490173, + "num_tokens": 729070430.0, + "step": 19107 + }, + { + "epoch": 2.4307340033074674, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2647714614868164, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8777602910995483, + "num_tokens": 729102127.0, + "step": 19108 + }, + { + "epoch": 2.430861213586058, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.212768793106079, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8675152659416199, + "num_tokens": 729138128.0, + "step": 19109 + }, + { + "epoch": 2.4309884238646484, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8043404817581177, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8633330464363098, + "num_tokens": 729179756.0, + "step": 19110 + }, + { + "epoch": 2.431115634143239, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.054696798324585, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8548530340194702, + "num_tokens": 729221356.0, + "step": 19111 + }, + { + "epoch": 2.4312428444218295, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.844267725944519, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8693065047264099, + "num_tokens": 729260456.0, + "step": 19112 + }, + { + "epoch": 2.4313700547004196, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8310935497283936, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8552299737930298, + "num_tokens": 729299654.0, + "step": 19113 + }, + { + "epoch": 2.4314972649790105, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8363581895828247, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8690975904464722, + "num_tokens": 729341296.0, + "step": 19114 + }, + { + "epoch": 2.4316244752576006, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.876771330833435, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8698974847793579, + "num_tokens": 729379632.0, + "step": 19115 + }, + { + "epoch": 2.431751685536191, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8096997737884521, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8678464889526367, + "num_tokens": 729423202.0, + "step": 19116 + }, + { + "epoch": 2.4318788958147817, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.856035590171814, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8853487372398376, + "num_tokens": 729457108.0, + "step": 19117 + }, + { + "epoch": 2.432006106093372, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0250489711761475, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8620511889457703, + "num_tokens": 729494065.0, + "step": 19118 + }, + { + "epoch": 2.4321333163719627, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7781562805175781, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8706983327865601, + "num_tokens": 729536621.0, + "step": 19119 + }, + { + "epoch": 2.4322605266505533, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2426199913024902, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8661932945251465, + "num_tokens": 729574423.0, + "step": 19120 + }, + { + "epoch": 2.432387736929144, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9575554132461548, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8611732721328735, + "num_tokens": 729614673.0, + "step": 19121 + }, + { + "epoch": 2.4325149472077343, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.931951880455017, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8665423393249512, + "num_tokens": 729653049.0, + "step": 19122 + }, + { + "epoch": 2.432642157486325, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8990315198898315, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.864503026008606, + "num_tokens": 729689667.0, + "step": 19123 + }, + { + "epoch": 2.4327693677649154, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.810634970664978, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8705251216888428, + "num_tokens": 729734747.0, + "step": 19124 + }, + { + "epoch": 2.432896578043506, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.996014952659607, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8617081046104431, + "num_tokens": 729770135.0, + "step": 19125 + }, + { + "epoch": 2.4330237883220964, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8410491943359375, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8649235367774963, + "num_tokens": 729809203.0, + "step": 19126 + }, + { + "epoch": 2.433150998600687, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8473973274230957, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.871963620185852, + "num_tokens": 729850164.0, + "step": 19127 + }, + { + "epoch": 2.4332782088792775, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.00346040725708, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8557305335998535, + "num_tokens": 729884337.0, + "step": 19128 + }, + { + "epoch": 2.433405419157868, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.742546796798706, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8790286779403687, + "num_tokens": 729924098.0, + "step": 19129 + }, + { + "epoch": 2.4335326294364585, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.929625153541565, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8855053782463074, + "num_tokens": 729960671.0, + "step": 19130 + }, + { + "epoch": 2.433659839715049, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7873246669769287, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8677289485931396, + "num_tokens": 730000780.0, + "step": 19131 + }, + { + "epoch": 2.4337870499936396, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9432340860366821, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8643789291381836, + "num_tokens": 730045198.0, + "step": 19132 + }, + { + "epoch": 2.43391426027223, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1434309482574463, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8606729507446289, + "num_tokens": 730085010.0, + "step": 19133 + }, + { + "epoch": 2.4340414705508207, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.007486581802368, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8618902564048767, + "num_tokens": 730121512.0, + "step": 19134 + }, + { + "epoch": 2.434168680829411, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8427060842514038, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8656326532363892, + "num_tokens": 730164859.0, + "step": 19135 + }, + { + "epoch": 2.4342958911080017, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8211994171142578, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8674896359443665, + "num_tokens": 730202273.0, + "step": 19136 + }, + { + "epoch": 2.4344231013865922, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8457903861999512, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8590570688247681, + "num_tokens": 730248208.0, + "step": 19137 + }, + { + "epoch": 2.4345503116651823, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8200018405914307, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.874155580997467, + "num_tokens": 730291084.0, + "step": 19138 + }, + { + "epoch": 2.4346775219437733, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.82571280002594, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8704941272735596, + "num_tokens": 730334504.0, + "step": 19139 + }, + { + "epoch": 2.4348047322223634, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9942353963851929, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8706310987472534, + "num_tokens": 730373356.0, + "step": 19140 + }, + { + "epoch": 2.434931942500954, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.86763596534729, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8521608114242554, + "num_tokens": 730416229.0, + "step": 19141 + }, + { + "epoch": 2.4350591527795444, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7793749570846558, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8795090913772583, + "num_tokens": 730458922.0, + "step": 19142 + }, + { + "epoch": 2.435186363058135, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8434202671051025, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8656386137008667, + "num_tokens": 730494583.0, + "step": 19143 + }, + { + "epoch": 2.4353135733367255, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.935823917388916, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.867484986782074, + "num_tokens": 730529526.0, + "step": 19144 + }, + { + "epoch": 2.435440783615316, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2050251960754395, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8703111410140991, + "num_tokens": 730558584.0, + "step": 19145 + }, + { + "epoch": 2.4355679938939065, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8254499435424805, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8662369251251221, + "num_tokens": 730598677.0, + "step": 19146 + }, + { + "epoch": 2.435695204172497, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8982877731323242, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8751699924468994, + "num_tokens": 730636985.0, + "step": 19147 + }, + { + "epoch": 2.4358224144510876, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0108861923217773, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8483500480651855, + "num_tokens": 730675409.0, + "step": 19148 + }, + { + "epoch": 2.435949624729678, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0987765789031982, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8536281585693359, + "num_tokens": 730714739.0, + "step": 19149 + }, + { + "epoch": 2.4360768350082687, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9291446208953857, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8641380071640015, + "num_tokens": 730757059.0, + "step": 19150 + }, + { + "epoch": 2.436204045286859, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.015942335128784, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8648156523704529, + "num_tokens": 730793469.0, + "step": 19151 + }, + { + "epoch": 2.4363312555654497, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8787925243377686, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.871907114982605, + "num_tokens": 730829336.0, + "step": 19152 + }, + { + "epoch": 2.4364584658440402, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9756267070770264, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8640884160995483, + "num_tokens": 730869304.0, + "step": 19153 + }, + { + "epoch": 2.4365856761226308, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1535236835479736, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8553911447525024, + "num_tokens": 730905315.0, + "step": 19154 + }, + { + "epoch": 2.4367128864012213, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8510844707489014, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.857290506362915, + "num_tokens": 730947346.0, + "step": 19155 + }, + { + "epoch": 2.436840096679812, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0102896690368652, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8713287115097046, + "num_tokens": 730976907.0, + "step": 19156 + }, + { + "epoch": 2.4369673069584024, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8512368202209473, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8619785904884338, + "num_tokens": 731015940.0, + "step": 19157 + }, + { + "epoch": 2.437094517236993, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8131104707717896, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8840211629867554, + "num_tokens": 731051345.0, + "step": 19158 + }, + { + "epoch": 2.4372217275155834, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0216963291168213, + "learning_rate": 1e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8340876698493958, + "num_tokens": 731089642.0, + "step": 19159 + }, + { + "epoch": 2.437348937794174, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8090558052062988, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.883625328540802, + "num_tokens": 731133196.0, + "step": 19160 + }, + { + "epoch": 2.4374761480727645, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0177090167999268, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8548528552055359, + "num_tokens": 731175510.0, + "step": 19161 + }, + { + "epoch": 2.437603358351355, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.746876835823059, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8733266592025757, + "num_tokens": 731217526.0, + "step": 19162 + }, + { + "epoch": 2.437730568629945, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8014154434204102, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.854651689529419, + "num_tokens": 731259525.0, + "step": 19163 + }, + { + "epoch": 2.437857778908536, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8812519311904907, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8713908791542053, + "num_tokens": 731301480.0, + "step": 19164 + }, + { + "epoch": 2.437984989187126, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7252247333526611, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8744338154792786, + "num_tokens": 731347385.0, + "step": 19165 + }, + { + "epoch": 2.4381121994657167, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8980731964111328, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8698323965072632, + "num_tokens": 731383666.0, + "step": 19166 + }, + { + "epoch": 2.438239409744307, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9079911708831787, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8663623929023743, + "num_tokens": 731420297.0, + "step": 19167 + }, + { + "epoch": 2.4383666200228977, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0389609336853027, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8624432682991028, + "num_tokens": 731455083.0, + "step": 19168 + }, + { + "epoch": 2.4384938303014883, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.236868143081665, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8747355341911316, + "num_tokens": 731482555.0, + "step": 19169 + }, + { + "epoch": 2.438621040580079, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.083475112915039, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8776124119758606, + "num_tokens": 731519459.0, + "step": 19170 + }, + { + "epoch": 2.4387482508586693, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9266985654830933, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8707945346832275, + "num_tokens": 731554577.0, + "step": 19171 + }, + { + "epoch": 2.43887546113726, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.98731529712677, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8580110669136047, + "num_tokens": 731594993.0, + "step": 19172 + }, + { + "epoch": 2.4390026714158504, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.038196563720703, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8705093264579773, + "num_tokens": 731629182.0, + "step": 19173 + }, + { + "epoch": 2.439129881694441, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9507884979248047, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8639951944351196, + "num_tokens": 731667661.0, + "step": 19174 + }, + { + "epoch": 2.4392570919730314, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.950446367263794, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8670724630355835, + "num_tokens": 731704911.0, + "step": 19175 + }, + { + "epoch": 2.439384302251622, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1366498470306396, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8488927483558655, + "num_tokens": 731744990.0, + "step": 19176 + }, + { + "epoch": 2.4395115125302125, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8938496112823486, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8679472208023071, + "num_tokens": 731783335.0, + "step": 19177 + }, + { + "epoch": 2.439638722808803, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9423716068267822, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8687946200370789, + "num_tokens": 731819206.0, + "step": 19178 + }, + { + "epoch": 2.4397659330873935, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9959220886230469, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8631526231765747, + "num_tokens": 731855291.0, + "step": 19179 + }, + { + "epoch": 2.439893143365984, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.938248872756958, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8694406747817993, + "num_tokens": 731899392.0, + "step": 19180 + }, + { + "epoch": 2.4400203536445746, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9803508520126343, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8664364814758301, + "num_tokens": 731937502.0, + "step": 19181 + }, + { + "epoch": 2.440147563923165, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0853116512298584, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8705638647079468, + "num_tokens": 731974946.0, + "step": 19182 + }, + { + "epoch": 2.4402747742017556, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9141955375671387, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8547453284263611, + "num_tokens": 732018732.0, + "step": 19183 + }, + { + "epoch": 2.440401984480346, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.072533369064331, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8649777770042419, + "num_tokens": 732052036.0, + "step": 19184 + }, + { + "epoch": 2.4405291947589367, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9064326286315918, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.86212158203125, + "num_tokens": 732090944.0, + "step": 19185 + }, + { + "epoch": 2.440656405037527, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8862758874893188, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8679365515708923, + "num_tokens": 732131406.0, + "step": 19186 + }, + { + "epoch": 2.4407836153161178, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8932926654815674, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8656340837478638, + "num_tokens": 732171701.0, + "step": 19187 + }, + { + "epoch": 2.440910825594708, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9612905979156494, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8752243518829346, + "num_tokens": 732209658.0, + "step": 19188 + }, + { + "epoch": 2.4410380358732984, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8766300678253174, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8549747467041016, + "num_tokens": 732251395.0, + "step": 19189 + }, + { + "epoch": 2.441165246151889, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9634865522384644, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8567816019058228, + "num_tokens": 732289306.0, + "step": 19190 + }, + { + "epoch": 2.4412924564304794, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 9.622174263000488, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8512839078903198, + "num_tokens": 732337916.0, + "step": 19191 + }, + { + "epoch": 2.44141966670907, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0726451873779297, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.852782130241394, + "num_tokens": 732380358.0, + "step": 19192 + }, + { + "epoch": 2.4415468769876605, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9915128946304321, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8763200044631958, + "num_tokens": 732416534.0, + "step": 19193 + }, + { + "epoch": 2.441674087266251, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8494924306869507, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8454360961914062, + "num_tokens": 732458927.0, + "step": 19194 + }, + { + "epoch": 2.4418012975448415, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7169148921966553, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8586576581001282, + "num_tokens": 732501261.0, + "step": 19195 + }, + { + "epoch": 2.441928507823432, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9026787281036377, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.874069094657898, + "num_tokens": 732541162.0, + "step": 19196 + }, + { + "epoch": 2.4420557181020226, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9892821311950684, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8631205558776855, + "num_tokens": 732578237.0, + "step": 19197 + }, + { + "epoch": 2.442182928380613, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8954230546951294, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8550477623939514, + "num_tokens": 732618624.0, + "step": 19198 + }, + { + "epoch": 2.4423101386592037, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8654502630233765, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8631496429443359, + "num_tokens": 732661747.0, + "step": 19199 + }, + { + "epoch": 2.442437348937794, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9663457870483398, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8451074957847595, + "num_tokens": 732703511.0, + "step": 19200 + }, + { + "epoch": 2.4425645592163847, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0963220596313477, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8458276391029358, + "num_tokens": 732746749.0, + "step": 19201 + }, + { + "epoch": 2.4426917694949752, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9397097826004028, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8666043281555176, + "num_tokens": 732783184.0, + "step": 19202 + }, + { + "epoch": 2.4428189797735658, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.978424072265625, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8596267700195312, + "num_tokens": 732818664.0, + "step": 19203 + }, + { + "epoch": 2.4429461900521563, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.337369918823242, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8565911054611206, + "num_tokens": 732854286.0, + "step": 19204 + }, + { + "epoch": 2.443073400330747, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8476159572601318, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8719820976257324, + "num_tokens": 732893032.0, + "step": 19205 + }, + { + "epoch": 2.4432006106093374, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9607716798782349, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8650939464569092, + "num_tokens": 732929942.0, + "step": 19206 + }, + { + "epoch": 2.443327820887928, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9380416870117188, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8601620197296143, + "num_tokens": 732968218.0, + "step": 19207 + }, + { + "epoch": 2.4434550311665184, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8386238813400269, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8891719579696655, + "num_tokens": 733001464.0, + "step": 19208 + }, + { + "epoch": 2.443582241445109, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.884830117225647, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8571221828460693, + "num_tokens": 733036311.0, + "step": 19209 + }, + { + "epoch": 2.4437094517236995, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9583531618118286, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8720076680183411, + "num_tokens": 733070487.0, + "step": 19210 + }, + { + "epoch": 2.4438366620022896, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7987936735153198, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8646996021270752, + "num_tokens": 733110357.0, + "step": 19211 + }, + { + "epoch": 2.4439638722808805, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.080148696899414, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8578547835350037, + "num_tokens": 733149070.0, + "step": 19212 + }, + { + "epoch": 2.4440910825594706, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9069790840148926, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8813861012458801, + "num_tokens": 733180848.0, + "step": 19213 + }, + { + "epoch": 2.444218292838061, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.075059175491333, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8665037751197815, + "num_tokens": 733219204.0, + "step": 19214 + }, + { + "epoch": 2.4443455031166517, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0332229137420654, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8548331260681152, + "num_tokens": 733255237.0, + "step": 19215 + }, + { + "epoch": 2.444472713395242, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8586236238479614, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8774275779724121, + "num_tokens": 733293243.0, + "step": 19216 + }, + { + "epoch": 2.4445999236738327, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.934073567390442, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8677645325660706, + "num_tokens": 733333530.0, + "step": 19217 + }, + { + "epoch": 2.4447271339524232, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8842847347259521, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8480701446533203, + "num_tokens": 733373790.0, + "step": 19218 + }, + { + "epoch": 2.4448543442310138, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8489232063293457, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8672266006469727, + "num_tokens": 733416037.0, + "step": 19219 + }, + { + "epoch": 2.4449815545096043, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9034852981567383, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8610448241233826, + "num_tokens": 733453014.0, + "step": 19220 + }, + { + "epoch": 2.445108764788195, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8989946842193604, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8520064353942871, + "num_tokens": 733491197.0, + "step": 19221 + }, + { + "epoch": 2.4452359750667854, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.881667137145996, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8787089586257935, + "num_tokens": 733531142.0, + "step": 19222 + }, + { + "epoch": 2.445363185345376, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7768791913986206, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8587974309921265, + "num_tokens": 733573916.0, + "step": 19223 + }, + { + "epoch": 2.4454903956239664, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.968955397605896, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8596817255020142, + "num_tokens": 733613204.0, + "step": 19224 + }, + { + "epoch": 2.445617605902557, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8211692571640015, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8606002330780029, + "num_tokens": 733653618.0, + "step": 19225 + }, + { + "epoch": 2.4457448161811475, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8662254810333252, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8818592429161072, + "num_tokens": 733687795.0, + "step": 19226 + }, + { + "epoch": 2.445872026459738, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0061850547790527, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8641494512557983, + "num_tokens": 733723574.0, + "step": 19227 + }, + { + "epoch": 2.4459992367383285, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8384346961975098, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.873383641242981, + "num_tokens": 733762051.0, + "step": 19228 + }, + { + "epoch": 2.446126447016919, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.800497055053711, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8819139003753662, + "num_tokens": 733800640.0, + "step": 19229 + }, + { + "epoch": 2.4462536572955096, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8434228897094727, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8543856143951416, + "num_tokens": 733841321.0, + "step": 19230 + }, + { + "epoch": 2.4463808675741, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0421230792999268, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8671014308929443, + "num_tokens": 733874104.0, + "step": 19231 + }, + { + "epoch": 2.4465080778526906, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0263047218322754, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8707553744316101, + "num_tokens": 733913127.0, + "step": 19232 + }, + { + "epoch": 2.446635288131281, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.885812520980835, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8827011585235596, + "num_tokens": 733951900.0, + "step": 19233 + }, + { + "epoch": 2.4467624984098717, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9056977033615112, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.867669403553009, + "num_tokens": 733992866.0, + "step": 19234 + }, + { + "epoch": 2.4468897086884622, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9939135313034058, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8608220815658569, + "num_tokens": 734029895.0, + "step": 19235 + }, + { + "epoch": 2.4470169189670523, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.894180417060852, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8817234039306641, + "num_tokens": 734066954.0, + "step": 19236 + }, + { + "epoch": 2.4471441292456433, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.159259080886841, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8871430158615112, + "num_tokens": 734102412.0, + "step": 19237 + }, + { + "epoch": 2.4472713395242334, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8352606296539307, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.882106363773346, + "num_tokens": 734141400.0, + "step": 19238 + }, + { + "epoch": 2.447398549802824, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.057051181793213, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8447834253311157, + "num_tokens": 734176628.0, + "step": 19239 + }, + { + "epoch": 2.4475257600814144, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8467882871627808, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8716813325881958, + "num_tokens": 734219756.0, + "step": 19240 + }, + { + "epoch": 2.447652970360005, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1752867698669434, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8646361231803894, + "num_tokens": 734250509.0, + "step": 19241 + }, + { + "epoch": 2.4477801806385955, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7876681089401245, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8716055154800415, + "num_tokens": 734287887.0, + "step": 19242 + }, + { + "epoch": 2.447907390917186, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8030812740325928, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8714637756347656, + "num_tokens": 734330446.0, + "step": 19243 + }, + { + "epoch": 2.4480346011957765, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7309141159057617, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8723517656326294, + "num_tokens": 734368427.0, + "step": 19244 + }, + { + "epoch": 2.448161811474367, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8355222940444946, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8845963478088379, + "num_tokens": 734405103.0, + "step": 19245 + }, + { + "epoch": 2.4482890217529576, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8814607858657837, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8636552691459656, + "num_tokens": 734448069.0, + "step": 19246 + }, + { + "epoch": 2.448416232031548, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.987051010131836, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8524550795555115, + "num_tokens": 734486016.0, + "step": 19247 + }, + { + "epoch": 2.4485434423101387, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9176881313323975, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8544824719429016, + "num_tokens": 734526943.0, + "step": 19248 + }, + { + "epoch": 2.448670652588729, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.766167163848877, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8776988983154297, + "num_tokens": 734566137.0, + "step": 19249 + }, + { + "epoch": 2.4487978628673197, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2193641662597656, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8638116121292114, + "num_tokens": 734602366.0, + "step": 19250 + }, + { + "epoch": 2.4489250731459102, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9128079414367676, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8743929266929626, + "num_tokens": 734644251.0, + "step": 19251 + }, + { + "epoch": 2.4490522834245008, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9749538898468018, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8627567291259766, + "num_tokens": 734682938.0, + "step": 19252 + }, + { + "epoch": 2.4491794937030913, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8054648637771606, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8666472434997559, + "num_tokens": 734722975.0, + "step": 19253 + }, + { + "epoch": 2.449306703981682, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8585513830184937, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8681095838546753, + "num_tokens": 734761361.0, + "step": 19254 + }, + { + "epoch": 2.4494339142602723, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8593415021896362, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8633455038070679, + "num_tokens": 734800597.0, + "step": 19255 + }, + { + "epoch": 2.449561124538863, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.188215494155884, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8540345430374146, + "num_tokens": 734840575.0, + "step": 19256 + }, + { + "epoch": 2.4496883348174534, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.066164255142212, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8571335077285767, + "num_tokens": 734876943.0, + "step": 19257 + }, + { + "epoch": 2.449815545096044, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7554221153259277, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8810038566589355, + "num_tokens": 734916436.0, + "step": 19258 + }, + { + "epoch": 2.4499427553746345, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8998905420303345, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8655940294265747, + "num_tokens": 734956336.0, + "step": 19259 + }, + { + "epoch": 2.450069965653225, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.811522126197815, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8643200993537903, + "num_tokens": 734997132.0, + "step": 19260 + }, + { + "epoch": 2.450197175931815, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8490859270095825, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8537204265594482, + "num_tokens": 735036192.0, + "step": 19261 + }, + { + "epoch": 2.450324386210406, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6648451089859009, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8792909383773804, + "num_tokens": 735079368.0, + "step": 19262 + }, + { + "epoch": 2.450451596488996, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.03749418258667, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8714749813079834, + "num_tokens": 735109714.0, + "step": 19263 + }, + { + "epoch": 2.4505788067675867, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0613086223602295, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8629320859909058, + "num_tokens": 735152712.0, + "step": 19264 + }, + { + "epoch": 2.450706017046177, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8388291597366333, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8607929944992065, + "num_tokens": 735193313.0, + "step": 19265 + }, + { + "epoch": 2.4508332273247677, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6730180978775024, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8807179927825928, + "num_tokens": 735235413.0, + "step": 19266 + }, + { + "epoch": 2.4509604376033582, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1793673038482666, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8668639659881592, + "num_tokens": 735270809.0, + "step": 19267 + }, + { + "epoch": 2.4510876478819488, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8701272010803223, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8586938381195068, + "num_tokens": 735309790.0, + "step": 19268 + }, + { + "epoch": 2.4512148581605393, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.377877950668335, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8665808439254761, + "num_tokens": 735341084.0, + "step": 19269 + }, + { + "epoch": 2.45134206843913, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0414509773254395, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8818258047103882, + "num_tokens": 735376964.0, + "step": 19270 + }, + { + "epoch": 2.4514692787177204, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8852578401565552, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8756552934646606, + "num_tokens": 735413296.0, + "step": 19271 + }, + { + "epoch": 2.451596488996311, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6995158195495605, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8708485960960388, + "num_tokens": 735454992.0, + "step": 19272 + }, + { + "epoch": 2.4517236992749014, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.859121561050415, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8674014210700989, + "num_tokens": 735493315.0, + "step": 19273 + }, + { + "epoch": 2.451850909553492, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7366567850112915, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8758952617645264, + "num_tokens": 735532666.0, + "step": 19274 + }, + { + "epoch": 2.4519781198320825, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8144181966781616, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8641718626022339, + "num_tokens": 735571987.0, + "step": 19275 + }, + { + "epoch": 2.452105330110673, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0162546634674072, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8644829988479614, + "num_tokens": 735609409.0, + "step": 19276 + }, + { + "epoch": 2.4522325403892635, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9823732376098633, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8392175436019897, + "num_tokens": 735647299.0, + "step": 19277 + }, + { + "epoch": 2.452359750667854, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.640214681625366, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8740237951278687, + "num_tokens": 735677757.0, + "step": 19278 + }, + { + "epoch": 2.4524869609464446, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.468442916870117, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8715071678161621, + "num_tokens": 735723924.0, + "step": 19279 + }, + { + "epoch": 2.452614171225035, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2083792686462402, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8504236340522766, + "num_tokens": 735758105.0, + "step": 19280 + }, + { + "epoch": 2.4527413815036256, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8521190881729126, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8632780909538269, + "num_tokens": 735797603.0, + "step": 19281 + }, + { + "epoch": 2.452868591782216, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.951193928718567, + "learning_rate": 1e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8396574854850769, + "num_tokens": 735838958.0, + "step": 19282 + }, + { + "epoch": 2.4529958020608067, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8440721035003662, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8582868576049805, + "num_tokens": 735878894.0, + "step": 19283 + }, + { + "epoch": 2.453123012339397, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7964787483215332, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8956711292266846, + "num_tokens": 735913222.0, + "step": 19284 + }, + { + "epoch": 2.4532502226179878, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.8555705547332764, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8731110692024231, + "num_tokens": 735951884.0, + "step": 19285 + }, + { + "epoch": 2.453377432896578, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.796235203742981, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8621797561645508, + "num_tokens": 735998955.0, + "step": 19286 + }, + { + "epoch": 2.4535046431751684, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0256338119506836, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8651501536369324, + "num_tokens": 736039631.0, + "step": 19287 + }, + { + "epoch": 2.453631853453759, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7165815830230713, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8624364137649536, + "num_tokens": 736085100.0, + "step": 19288 + }, + { + "epoch": 2.4537590637323494, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.834676742553711, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8822992444038391, + "num_tokens": 736119680.0, + "step": 19289 + }, + { + "epoch": 2.45388627401094, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7110810279846191, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8769707083702087, + "num_tokens": 736159769.0, + "step": 19290 + }, + { + "epoch": 2.4540134842895305, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.778051495552063, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.872923731803894, + "num_tokens": 736199566.0, + "step": 19291 + }, + { + "epoch": 2.454140694568121, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9538902044296265, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.870627224445343, + "num_tokens": 736239238.0, + "step": 19292 + }, + { + "epoch": 2.4542679048467115, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8162949085235596, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8702917098999023, + "num_tokens": 736274576.0, + "step": 19293 + }, + { + "epoch": 2.454395115125302, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.948681354522705, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8583531975746155, + "num_tokens": 736307509.0, + "step": 19294 + }, + { + "epoch": 2.4545223254038926, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8326692581176758, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8632305860519409, + "num_tokens": 736348313.0, + "step": 19295 + }, + { + "epoch": 2.454649535682483, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.122575044631958, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8627954721450806, + "num_tokens": 736380716.0, + "step": 19296 + }, + { + "epoch": 2.4547767459610736, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9277708530426025, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8454971313476562, + "num_tokens": 736422669.0, + "step": 19297 + }, + { + "epoch": 2.454903956239664, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.074847936630249, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8742169141769409, + "num_tokens": 736452929.0, + "step": 19298 + }, + { + "epoch": 2.4550311665182547, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9872746467590332, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8680720329284668, + "num_tokens": 736486755.0, + "step": 19299 + }, + { + "epoch": 2.4551583767968452, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.085080146789551, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8741865158081055, + "num_tokens": 736521542.0, + "step": 19300 + }, + { + "epoch": 2.4552855870754358, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9074006080627441, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8666337728500366, + "num_tokens": 736557312.0, + "step": 19301 + }, + { + "epoch": 2.4554127973540263, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8989514112472534, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8615990877151489, + "num_tokens": 736596469.0, + "step": 19302 + }, + { + "epoch": 2.455540007632617, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9467785358428955, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8510090112686157, + "num_tokens": 736636078.0, + "step": 19303 + }, + { + "epoch": 2.4556672179112073, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8968733549118042, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8801122307777405, + "num_tokens": 736674117.0, + "step": 19304 + }, + { + "epoch": 2.455794428189798, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0483827590942383, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8612923622131348, + "num_tokens": 736709116.0, + "step": 19305 + }, + { + "epoch": 2.4559216384683884, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9613877534866333, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8592633605003357, + "num_tokens": 736750383.0, + "step": 19306 + }, + { + "epoch": 2.456048848746979, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0481629371643066, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8542576432228088, + "num_tokens": 736790836.0, + "step": 19307 + }, + { + "epoch": 2.4561760590255695, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3899264335632324, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8631142377853394, + "num_tokens": 736826085.0, + "step": 19308 + }, + { + "epoch": 2.4563032693041595, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9778519868850708, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8646029829978943, + "num_tokens": 736860599.0, + "step": 19309 + }, + { + "epoch": 2.4564304795827505, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.110008955001831, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8829639554023743, + "num_tokens": 736893286.0, + "step": 19310 + }, + { + "epoch": 2.4565576898613406, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7376444339752197, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8757470846176147, + "num_tokens": 736931597.0, + "step": 19311 + }, + { + "epoch": 2.456684900139931, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7909584045410156, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8737847805023193, + "num_tokens": 736972183.0, + "step": 19312 + }, + { + "epoch": 2.4568121104185217, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.009711265563965, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.872260570526123, + "num_tokens": 737004184.0, + "step": 19313 + }, + { + "epoch": 2.456939320697112, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.956817865371704, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8658343553543091, + "num_tokens": 737042644.0, + "step": 19314 + }, + { + "epoch": 2.4570665309757027, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0599617958068848, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8560079336166382, + "num_tokens": 737074830.0, + "step": 19315 + }, + { + "epoch": 2.4571937412542932, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0100514888763428, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8733880519866943, + "num_tokens": 737107756.0, + "step": 19316 + }, + { + "epoch": 2.4573209515328838, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.946913480758667, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8748568296432495, + "num_tokens": 737143457.0, + "step": 19317 + }, + { + "epoch": 2.4574481618114743, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9842718839645386, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8617469668388367, + "num_tokens": 737177005.0, + "step": 19318 + }, + { + "epoch": 2.457575372090065, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8503546714782715, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8567652106285095, + "num_tokens": 737218265.0, + "step": 19319 + }, + { + "epoch": 2.4577025823686554, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3271243572235107, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8561003804206848, + "num_tokens": 737259495.0, + "step": 19320 + }, + { + "epoch": 2.457829792647246, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9015458822250366, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8586812019348145, + "num_tokens": 737300073.0, + "step": 19321 + }, + { + "epoch": 2.4579570029258364, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9221248626708984, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.866287350654602, + "num_tokens": 737334716.0, + "step": 19322 + }, + { + "epoch": 2.458084213204427, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7447658777236938, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8717268705368042, + "num_tokens": 737377086.0, + "step": 19323 + }, + { + "epoch": 2.4582114234830175, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7901097536087036, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8558729887008667, + "num_tokens": 737418259.0, + "step": 19324 + }, + { + "epoch": 2.458338633761608, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1401846408843994, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8557413220405579, + "num_tokens": 737453150.0, + "step": 19325 + }, + { + "epoch": 2.4584658440401985, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8660693168640137, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8817628622055054, + "num_tokens": 737495552.0, + "step": 19326 + }, + { + "epoch": 2.458593054318789, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.963042140007019, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8717517852783203, + "num_tokens": 737537518.0, + "step": 19327 + }, + { + "epoch": 2.4587202645973796, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8651715517044067, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8632043600082397, + "num_tokens": 737577472.0, + "step": 19328 + }, + { + "epoch": 2.45884747487597, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7565606832504272, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8586214780807495, + "num_tokens": 737617250.0, + "step": 19329 + }, + { + "epoch": 2.4589746851545606, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7474946975708008, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8699051737785339, + "num_tokens": 737655544.0, + "step": 19330 + }, + { + "epoch": 2.459101895433151, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8793644905090332, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.869870126247406, + "num_tokens": 737688644.0, + "step": 19331 + }, + { + "epoch": 2.4592291057117417, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8471484184265137, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8697264194488525, + "num_tokens": 737730264.0, + "step": 19332 + }, + { + "epoch": 2.459356315990332, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 3.040942907333374, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8609136343002319, + "num_tokens": 737763578.0, + "step": 19333 + }, + { + "epoch": 2.4594835262689223, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.878007173538208, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8682810664176941, + "num_tokens": 737804062.0, + "step": 19334 + }, + { + "epoch": 2.4596107365475133, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.708702564239502, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8816626667976379, + "num_tokens": 737845608.0, + "step": 19335 + }, + { + "epoch": 2.4597379468261034, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9220670461654663, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8484480977058411, + "num_tokens": 737881366.0, + "step": 19336 + }, + { + "epoch": 2.459865157104694, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8586888313293457, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8587802052497864, + "num_tokens": 737922595.0, + "step": 19337 + }, + { + "epoch": 2.4599923673832844, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9750076532363892, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8728172779083252, + "num_tokens": 737956368.0, + "step": 19338 + }, + { + "epoch": 2.460119577661875, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8189618587493896, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.872805118560791, + "num_tokens": 738000167.0, + "step": 19339 + }, + { + "epoch": 2.4602467879404655, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9417479038238525, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8597937822341919, + "num_tokens": 738040850.0, + "step": 19340 + }, + { + "epoch": 2.460373998219056, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 9.662391662597656, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8646540641784668, + "num_tokens": 738080370.0, + "step": 19341 + }, + { + "epoch": 2.4605012084976465, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9303122758865356, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8704689145088196, + "num_tokens": 738116751.0, + "step": 19342 + }, + { + "epoch": 2.460628418776237, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9841171503067017, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8839614391326904, + "num_tokens": 738154597.0, + "step": 19343 + }, + { + "epoch": 2.4607556290548276, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9408018589019775, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8738107681274414, + "num_tokens": 738186484.0, + "step": 19344 + }, + { + "epoch": 2.460882839333418, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0831427574157715, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8713311553001404, + "num_tokens": 738217722.0, + "step": 19345 + }, + { + "epoch": 2.4610100496120086, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8607425689697266, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8805259466171265, + "num_tokens": 738255304.0, + "step": 19346 + }, + { + "epoch": 2.461137259890599, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9505972862243652, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8625361919403076, + "num_tokens": 738292162.0, + "step": 19347 + }, + { + "epoch": 2.4612644701691897, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7581517696380615, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8786613941192627, + "num_tokens": 738332851.0, + "step": 19348 + }, + { + "epoch": 2.4613916804477802, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7403156757354736, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8773016929626465, + "num_tokens": 738375120.0, + "step": 19349 + }, + { + "epoch": 2.4615188907263708, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7437899112701416, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8581573963165283, + "num_tokens": 738413857.0, + "step": 19350 + }, + { + "epoch": 2.4616461010049613, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8932799100875854, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8604037761688232, + "num_tokens": 738450097.0, + "step": 19351 + }, + { + "epoch": 2.461773311283552, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9504178762435913, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8486328125, + "num_tokens": 738483784.0, + "step": 19352 + }, + { + "epoch": 2.4619005215621423, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7798765897750854, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8475051522254944, + "num_tokens": 738526820.0, + "step": 19353 + }, + { + "epoch": 2.462027731840733, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8824114799499512, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8519797921180725, + "num_tokens": 738569049.0, + "step": 19354 + }, + { + "epoch": 2.4621549421193234, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9041639566421509, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8641345500946045, + "num_tokens": 738608320.0, + "step": 19355 + }, + { + "epoch": 2.462282152397914, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8400354385375977, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.88055819272995, + "num_tokens": 738645566.0, + "step": 19356 + }, + { + "epoch": 2.4624093626765045, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.025424003601074, + "learning_rate": 1e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8435525894165039, + "num_tokens": 738686184.0, + "step": 19357 + }, + { + "epoch": 2.462536572955095, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.855517864227295, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8518165349960327, + "num_tokens": 738728820.0, + "step": 19358 + }, + { + "epoch": 2.462663783233685, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1422319412231445, + "learning_rate": 1e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.8368632793426514, + "num_tokens": 738760180.0, + "step": 19359 + }, + { + "epoch": 2.462790993512276, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0191471576690674, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8732709884643555, + "num_tokens": 738795501.0, + "step": 19360 + }, + { + "epoch": 2.462918203790866, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9686992168426514, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.856169581413269, + "num_tokens": 738831027.0, + "step": 19361 + }, + { + "epoch": 2.4630454140694567, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8873251676559448, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8481036424636841, + "num_tokens": 738870383.0, + "step": 19362 + }, + { + "epoch": 2.463172624348047, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9445431232452393, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.870974600315094, + "num_tokens": 738908231.0, + "step": 19363 + }, + { + "epoch": 2.4632998346266377, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.838789939880371, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8567989468574524, + "num_tokens": 738950927.0, + "step": 19364 + }, + { + "epoch": 2.4634270449052282, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9344916343688965, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8773122429847717, + "num_tokens": 738989857.0, + "step": 19365 + }, + { + "epoch": 2.4635542551838188, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9523239135742188, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8793693780899048, + "num_tokens": 739021281.0, + "step": 19366 + }, + { + "epoch": 2.4636814654624093, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7739720344543457, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8731191158294678, + "num_tokens": 739063475.0, + "step": 19367 + }, + { + "epoch": 2.463808675741, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9384204149246216, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8611478805541992, + "num_tokens": 739107083.0, + "step": 19368 + }, + { + "epoch": 2.4639358860195903, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7764620780944824, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8449698686599731, + "num_tokens": 739152002.0, + "step": 19369 + }, + { + "epoch": 2.464063096298181, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8954600095748901, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8628214597702026, + "num_tokens": 739191781.0, + "step": 19370 + }, + { + "epoch": 2.4641903065767714, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8826515674591064, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8885300159454346, + "num_tokens": 739230374.0, + "step": 19371 + }, + { + "epoch": 2.464317516855362, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9508692026138306, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8608181476593018, + "num_tokens": 739267310.0, + "step": 19372 + }, + { + "epoch": 2.4644447271339525, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.872353434562683, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8694361448287964, + "num_tokens": 739305074.0, + "step": 19373 + }, + { + "epoch": 2.464571937412543, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9124757051467896, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8571116328239441, + "num_tokens": 739343028.0, + "step": 19374 + }, + { + "epoch": 2.4646991476911335, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.973321795463562, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8649480938911438, + "num_tokens": 739384364.0, + "step": 19375 + }, + { + "epoch": 2.464826357969724, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8271541595458984, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8813978433609009, + "num_tokens": 739423564.0, + "step": 19376 + }, + { + "epoch": 2.4649535682483146, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8382478952407837, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.875940203666687, + "num_tokens": 739457609.0, + "step": 19377 + }, + { + "epoch": 2.465080778526905, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.87449049949646, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8666223287582397, + "num_tokens": 739497204.0, + "step": 19378 + }, + { + "epoch": 2.4652079888054956, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8020474910736084, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8618886470794678, + "num_tokens": 739543688.0, + "step": 19379 + }, + { + "epoch": 2.465335199084086, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.082609176635742, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8768446445465088, + "num_tokens": 739578221.0, + "step": 19380 + }, + { + "epoch": 2.4654624093626767, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9244716167449951, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8708338141441345, + "num_tokens": 739615892.0, + "step": 19381 + }, + { + "epoch": 2.4655896196412668, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.830705165863037, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8516712784767151, + "num_tokens": 739657697.0, + "step": 19382 + }, + { + "epoch": 2.4657168299198577, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.948848843574524, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8756840229034424, + "num_tokens": 739694173.0, + "step": 19383 + }, + { + "epoch": 2.465844040198448, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8207392692565918, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8640509843826294, + "num_tokens": 739734111.0, + "step": 19384 + }, + { + "epoch": 2.4659712504770384, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0455079078674316, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.866554856300354, + "num_tokens": 739771409.0, + "step": 19385 + }, + { + "epoch": 2.466098460755629, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9467229843139648, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8517154455184937, + "num_tokens": 739807185.0, + "step": 19386 + }, + { + "epoch": 2.4662256710342194, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2453560829162598, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8770186901092529, + "num_tokens": 739837750.0, + "step": 19387 + }, + { + "epoch": 2.46635288131281, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9353190660476685, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8652505874633789, + "num_tokens": 739879516.0, + "step": 19388 + }, + { + "epoch": 2.4664800915914005, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8312736749649048, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8728981018066406, + "num_tokens": 739918688.0, + "step": 19389 + }, + { + "epoch": 2.466607301869991, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8324428796768188, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8701611161231995, + "num_tokens": 739958978.0, + "step": 19390 + }, + { + "epoch": 2.4667345121485815, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7759771347045898, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8700838088989258, + "num_tokens": 740002428.0, + "step": 19391 + }, + { + "epoch": 2.466861722427172, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6430963277816772, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8859688639640808, + "num_tokens": 740045856.0, + "step": 19392 + }, + { + "epoch": 2.4669889327057626, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9152864217758179, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8639271259307861, + "num_tokens": 740084512.0, + "step": 19393 + }, + { + "epoch": 2.467116142984353, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9861994981765747, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8774347305297852, + "num_tokens": 740129624.0, + "step": 19394 + }, + { + "epoch": 2.4672433532629436, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.023500919342041, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8491114974021912, + "num_tokens": 740168368.0, + "step": 19395 + }, + { + "epoch": 2.467370563541534, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.4729175567626953, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8587131500244141, + "num_tokens": 740206670.0, + "step": 19396 + }, + { + "epoch": 2.4674977738201247, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9898173809051514, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8427489399909973, + "num_tokens": 740246422.0, + "step": 19397 + }, + { + "epoch": 2.4676249840987152, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9598551988601685, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8636705875396729, + "num_tokens": 740284450.0, + "step": 19398 + }, + { + "epoch": 2.4677521943773058, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9596070051193237, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.843515157699585, + "num_tokens": 740319062.0, + "step": 19399 + }, + { + "epoch": 2.4678794046558963, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.909323811531067, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8664627075195312, + "num_tokens": 740353996.0, + "step": 19400 + }, + { + "epoch": 2.468006614934487, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8404200077056885, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8646904826164246, + "num_tokens": 740394694.0, + "step": 19401 + }, + { + "epoch": 2.4681338252130773, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6731741428375244, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.886748194694519, + "num_tokens": 740436567.0, + "step": 19402 + }, + { + "epoch": 2.468261035491668, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8447071313858032, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.85727858543396, + "num_tokens": 740476458.0, + "step": 19403 + }, + { + "epoch": 2.4683882457702584, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8002208471298218, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8759356737136841, + "num_tokens": 740514703.0, + "step": 19404 + }, + { + "epoch": 2.468515456048849, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.830505132675171, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8764564990997314, + "num_tokens": 740552328.0, + "step": 19405 + }, + { + "epoch": 2.4686426663274394, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8007287979125977, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8494070172309875, + "num_tokens": 740593141.0, + "step": 19406 + }, + { + "epoch": 2.4687698766060295, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7083898782730103, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8758741617202759, + "num_tokens": 740634097.0, + "step": 19407 + }, + { + "epoch": 2.4688970868846205, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.820729374885559, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.872303307056427, + "num_tokens": 740672917.0, + "step": 19408 + }, + { + "epoch": 2.4690242971632106, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.875030755996704, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8604350090026855, + "num_tokens": 740709351.0, + "step": 19409 + }, + { + "epoch": 2.469151507441801, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.097909927368164, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8569833040237427, + "num_tokens": 740744049.0, + "step": 19410 + }, + { + "epoch": 2.4692787177203916, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.814070463180542, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8687930107116699, + "num_tokens": 740784765.0, + "step": 19411 + }, + { + "epoch": 2.469405927998982, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1687583923339844, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.859387218952179, + "num_tokens": 740815859.0, + "step": 19412 + }, + { + "epoch": 2.4695331382775727, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0105602741241455, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8841077089309692, + "num_tokens": 740854300.0, + "step": 19413 + }, + { + "epoch": 2.4696603485561632, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.104952096939087, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8496482968330383, + "num_tokens": 740891836.0, + "step": 19414 + }, + { + "epoch": 2.4697875588347538, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8390116691589355, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8744617700576782, + "num_tokens": 740927434.0, + "step": 19415 + }, + { + "epoch": 2.4699147691133443, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8590208292007446, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8681131601333618, + "num_tokens": 740973707.0, + "step": 19416 + }, + { + "epoch": 2.470041979391935, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.6671459674835205, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8722487688064575, + "num_tokens": 741015953.0, + "step": 19417 + }, + { + "epoch": 2.4701691896705253, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.034912347793579, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8732660412788391, + "num_tokens": 741049634.0, + "step": 19418 + }, + { + "epoch": 2.470296399949116, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9540926218032837, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8750279545783997, + "num_tokens": 741084267.0, + "step": 19419 + }, + { + "epoch": 2.4704236102277064, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7500581741333008, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8726279735565186, + "num_tokens": 741122237.0, + "step": 19420 + }, + { + "epoch": 2.470550820506297, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8674215078353882, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8536903858184814, + "num_tokens": 741163593.0, + "step": 19421 + }, + { + "epoch": 2.4706780307848875, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.871890902519226, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8558351397514343, + "num_tokens": 741202328.0, + "step": 19422 + }, + { + "epoch": 2.470805241063478, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9098665714263916, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8853002786636353, + "num_tokens": 741240095.0, + "step": 19423 + }, + { + "epoch": 2.4709324513420685, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9681758880615234, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8681395053863525, + "num_tokens": 741278509.0, + "step": 19424 + }, + { + "epoch": 2.471059661620659, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9099249839782715, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8602758049964905, + "num_tokens": 741321446.0, + "step": 19425 + }, + { + "epoch": 2.4711868718992496, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0151174068450928, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8638176321983337, + "num_tokens": 741360469.0, + "step": 19426 + }, + { + "epoch": 2.47131408217784, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.027869462966919, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8551759719848633, + "num_tokens": 741393833.0, + "step": 19427 + }, + { + "epoch": 2.4714412924564306, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.102294445037842, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8697596192359924, + "num_tokens": 741426956.0, + "step": 19428 + }, + { + "epoch": 2.471568502735021, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.791039228439331, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8662749528884888, + "num_tokens": 741464820.0, + "step": 19429 + }, + { + "epoch": 2.4716957130136117, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0145397186279297, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8733127117156982, + "num_tokens": 741499519.0, + "step": 19430 + }, + { + "epoch": 2.471822923292202, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.952932357788086, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8717372417449951, + "num_tokens": 741533443.0, + "step": 19431 + }, + { + "epoch": 2.4719501335707923, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9069538116455078, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8632131814956665, + "num_tokens": 741576849.0, + "step": 19432 + }, + { + "epoch": 2.4720773438493833, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.906059980392456, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8702024221420288, + "num_tokens": 741614653.0, + "step": 19433 + }, + { + "epoch": 2.4722045541279734, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9439523220062256, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.861182451248169, + "num_tokens": 741649354.0, + "step": 19434 + }, + { + "epoch": 2.472331764406564, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9241597652435303, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8605657815933228, + "num_tokens": 741692257.0, + "step": 19435 + }, + { + "epoch": 2.4724589746851544, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9617916345596313, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8636344075202942, + "num_tokens": 741729739.0, + "step": 19436 + }, + { + "epoch": 2.472586184963745, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9070380926132202, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8589606881141663, + "num_tokens": 741763409.0, + "step": 19437 + }, + { + "epoch": 2.4727133952423355, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9372329711914062, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8627957105636597, + "num_tokens": 741798746.0, + "step": 19438 + }, + { + "epoch": 2.472840605520926, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6805696487426758, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8751637935638428, + "num_tokens": 741840488.0, + "step": 19439 + }, + { + "epoch": 2.4729678157995165, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8487387895584106, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8844006657600403, + "num_tokens": 741880839.0, + "step": 19440 + }, + { + "epoch": 2.473095026078107, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8162593841552734, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.859809160232544, + "num_tokens": 741919317.0, + "step": 19441 + }, + { + "epoch": 2.4732222363566976, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0038461685180664, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.845212459564209, + "num_tokens": 741955782.0, + "step": 19442 + }, + { + "epoch": 2.473349446635288, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0831074714660645, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8598053455352783, + "num_tokens": 741990410.0, + "step": 19443 + }, + { + "epoch": 2.4734766569138786, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.96318781375885, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8597736954689026, + "num_tokens": 742031536.0, + "step": 19444 + }, + { + "epoch": 2.473603867192469, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8273236751556396, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8539649248123169, + "num_tokens": 742067222.0, + "step": 19445 + }, + { + "epoch": 2.4737310774710597, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7295762300491333, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8774453401565552, + "num_tokens": 742114740.0, + "step": 19446 + }, + { + "epoch": 2.47385828774965, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9683568477630615, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.871852457523346, + "num_tokens": 742149344.0, + "step": 19447 + }, + { + "epoch": 2.4739854980282407, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9023114442825317, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8675720691680908, + "num_tokens": 742191089.0, + "step": 19448 + }, + { + "epoch": 2.4741127083068313, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.035109519958496, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8660157918930054, + "num_tokens": 742224944.0, + "step": 19449 + }, + { + "epoch": 2.474239918585422, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9959384202957153, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8560841083526611, + "num_tokens": 742263674.0, + "step": 19450 + }, + { + "epoch": 2.4743671288640123, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9638360738754272, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8618486523628235, + "num_tokens": 742300424.0, + "step": 19451 + }, + { + "epoch": 2.474494339142603, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9161566495895386, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.87565016746521, + "num_tokens": 742333854.0, + "step": 19452 + }, + { + "epoch": 2.4746215494211934, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8032302856445312, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.874517560005188, + "num_tokens": 742377126.0, + "step": 19453 + }, + { + "epoch": 2.474748759699784, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8113977909088135, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8644480109214783, + "num_tokens": 742421284.0, + "step": 19454 + }, + { + "epoch": 2.4748759699783744, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2638416290283203, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8480714559555054, + "num_tokens": 742459124.0, + "step": 19455 + }, + { + "epoch": 2.475003180256965, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.989512324333191, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8583444356918335, + "num_tokens": 742493137.0, + "step": 19456 + }, + { + "epoch": 2.475130390535555, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1462290287017822, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8678681254386902, + "num_tokens": 742529805.0, + "step": 19457 + }, + { + "epoch": 2.475257600814146, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0472922325134277, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8787193894386292, + "num_tokens": 742560859.0, + "step": 19458 + }, + { + "epoch": 2.475384811092736, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.74639093875885, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8786802887916565, + "num_tokens": 742601465.0, + "step": 19459 + }, + { + "epoch": 2.4755120213713266, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.960935115814209, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8591852188110352, + "num_tokens": 742639242.0, + "step": 19460 + }, + { + "epoch": 2.475639231649917, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0070292949676514, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8659518957138062, + "num_tokens": 742676574.0, + "step": 19461 + }, + { + "epoch": 2.4757664419285077, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2314658164978027, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8559704422950745, + "num_tokens": 742713326.0, + "step": 19462 + }, + { + "epoch": 2.4758936522070982, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8248751163482666, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8424245715141296, + "num_tokens": 742756883.0, + "step": 19463 + }, + { + "epoch": 2.4760208624856888, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9781960248947144, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8609709143638611, + "num_tokens": 742794301.0, + "step": 19464 + }, + { + "epoch": 2.4761480727642793, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9480385780334473, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8675283193588257, + "num_tokens": 742832966.0, + "step": 19465 + }, + { + "epoch": 2.47627528304287, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.128183364868164, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8622843623161316, + "num_tokens": 742872926.0, + "step": 19466 + }, + { + "epoch": 2.4764024933214603, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9670584201812744, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8719293475151062, + "num_tokens": 742910838.0, + "step": 19467 + }, + { + "epoch": 2.476529703600051, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8998006582260132, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8735185265541077, + "num_tokens": 742950098.0, + "step": 19468 + }, + { + "epoch": 2.4766569138786414, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7075074911117554, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.877066969871521, + "num_tokens": 742983872.0, + "step": 19469 + }, + { + "epoch": 2.476784124157232, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.668556809425354, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.855204164981842, + "num_tokens": 743029816.0, + "step": 19470 + }, + { + "epoch": 2.4769113344358225, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.996774435043335, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8573818206787109, + "num_tokens": 743068565.0, + "step": 19471 + }, + { + "epoch": 2.477038544714413, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8319655656814575, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.86815345287323, + "num_tokens": 743112137.0, + "step": 19472 + }, + { + "epoch": 2.4771657549930035, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0795974731445312, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8784462213516235, + "num_tokens": 743145067.0, + "step": 19473 + }, + { + "epoch": 2.477292965271594, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1753432750701904, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8710957169532776, + "num_tokens": 743177017.0, + "step": 19474 + }, + { + "epoch": 2.4774201755501846, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7972755432128906, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8661647439002991, + "num_tokens": 743218121.0, + "step": 19475 + }, + { + "epoch": 2.477547385828775, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.893203854560852, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8710077404975891, + "num_tokens": 743256635.0, + "step": 19476 + }, + { + "epoch": 2.4776745961073656, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9714909791946411, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8627135753631592, + "num_tokens": 743293801.0, + "step": 19477 + }, + { + "epoch": 2.477801806385956, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.883162498474121, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.880831778049469, + "num_tokens": 743331582.0, + "step": 19478 + }, + { + "epoch": 2.4779290166645467, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9458039999008179, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8680044412612915, + "num_tokens": 743369901.0, + "step": 19479 + }, + { + "epoch": 2.4780562269431368, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.916780948638916, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8598679900169373, + "num_tokens": 743405808.0, + "step": 19480 + }, + { + "epoch": 2.4781834372217277, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7861906290054321, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8688631057739258, + "num_tokens": 743447236.0, + "step": 19481 + }, + { + "epoch": 2.478310647500318, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9286353588104248, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8723533153533936, + "num_tokens": 743486494.0, + "step": 19482 + }, + { + "epoch": 2.4784378577789083, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8971967697143555, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.863496720790863, + "num_tokens": 743523941.0, + "step": 19483 + }, + { + "epoch": 2.478565068057499, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0422441959381104, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8684040307998657, + "num_tokens": 743554169.0, + "step": 19484 + }, + { + "epoch": 2.4786922783360894, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9362497329711914, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8509650230407715, + "num_tokens": 743594656.0, + "step": 19485 + }, + { + "epoch": 2.47881948861468, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.017972707748413, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8735584020614624, + "num_tokens": 743628638.0, + "step": 19486 + }, + { + "epoch": 2.4789466988932705, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.043163537979126, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8580559492111206, + "num_tokens": 743665382.0, + "step": 19487 + }, + { + "epoch": 2.479073909171861, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.951392650604248, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8793578147888184, + "num_tokens": 743701712.0, + "step": 19488 + }, + { + "epoch": 2.4792011194504515, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9546622037887573, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8659540414810181, + "num_tokens": 743740347.0, + "step": 19489 + }, + { + "epoch": 2.479328329729042, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9892654418945312, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8676900863647461, + "num_tokens": 743775434.0, + "step": 19490 + }, + { + "epoch": 2.4794555400076326, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.852466344833374, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8647509217262268, + "num_tokens": 743812475.0, + "step": 19491 + }, + { + "epoch": 2.479582750286223, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 20.458168029785156, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8576259613037109, + "num_tokens": 743851350.0, + "step": 19492 + }, + { + "epoch": 2.4797099605648136, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1242151260375977, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8612996935844421, + "num_tokens": 743885086.0, + "step": 19493 + }, + { + "epoch": 2.479837170843404, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9795349836349487, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8688632249832153, + "num_tokens": 743931747.0, + "step": 19494 + }, + { + "epoch": 2.4799643811219947, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7861437797546387, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8784000277519226, + "num_tokens": 743971806.0, + "step": 19495 + }, + { + "epoch": 2.480091591400585, + "ewc_loss": 9.000301361083984e-06, + "grad_norm": 12.726224899291992, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8755605816841125, + "num_tokens": 744005304.0, + "step": 19496 + }, + { + "epoch": 2.4802188016791757, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8212370872497559, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8578550815582275, + "num_tokens": 744051159.0, + "step": 19497 + }, + { + "epoch": 2.4803460119577663, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9372068643569946, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8483406901359558, + "num_tokens": 744089856.0, + "step": 19498 + }, + { + "epoch": 2.480473222236357, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8753641843795776, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8453332185745239, + "num_tokens": 744136355.0, + "step": 19499 + }, + { + "epoch": 2.4806004325149473, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.861151099205017, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8549293279647827, + "num_tokens": 744177682.0, + "step": 19500 + }, + { + "epoch": 2.480727642793538, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.773339033126831, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8676025867462158, + "num_tokens": 744214209.0, + "step": 19501 + }, + { + "epoch": 2.4808548530721284, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7644321918487549, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8491263389587402, + "num_tokens": 744257923.0, + "step": 19502 + }, + { + "epoch": 2.480982063350719, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9967081546783447, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8715572357177734, + "num_tokens": 744293370.0, + "step": 19503 + }, + { + "epoch": 2.4811092736293094, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9223732948303223, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8729608654975891, + "num_tokens": 744328572.0, + "step": 19504 + }, + { + "epoch": 2.4812364839078995, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0527493953704834, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.848915696144104, + "num_tokens": 744364124.0, + "step": 19505 + }, + { + "epoch": 2.4813636941864905, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0131945610046387, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8692823648452759, + "num_tokens": 744403679.0, + "step": 19506 + }, + { + "epoch": 2.4814909044650806, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0859978199005127, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8654410243034363, + "num_tokens": 744440922.0, + "step": 19507 + }, + { + "epoch": 2.481618114743671, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9051746129989624, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8620905876159668, + "num_tokens": 744479025.0, + "step": 19508 + }, + { + "epoch": 2.4817453250222616, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.059237480163574, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8664974570274353, + "num_tokens": 744514165.0, + "step": 19509 + }, + { + "epoch": 2.481872535300852, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9546693563461304, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8607295751571655, + "num_tokens": 744547210.0, + "step": 19510 + }, + { + "epoch": 2.4819997455794427, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9728244543075562, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.865452766418457, + "num_tokens": 744585132.0, + "step": 19511 + }, + { + "epoch": 2.4821269558580332, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9253865480422974, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8770440220832825, + "num_tokens": 744617921.0, + "step": 19512 + }, + { + "epoch": 2.4822541661366238, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.997084140777588, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8619999289512634, + "num_tokens": 744653095.0, + "step": 19513 + }, + { + "epoch": 2.4823813764152143, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.858486533164978, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8709901571273804, + "num_tokens": 744688258.0, + "step": 19514 + }, + { + "epoch": 2.482508586693805, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.079210042953491, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.86504065990448, + "num_tokens": 744723500.0, + "step": 19515 + }, + { + "epoch": 2.4826357969723953, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9772987365722656, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8656167984008789, + "num_tokens": 744757133.0, + "step": 19516 + }, + { + "epoch": 2.482763007250986, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8498250246047974, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8772354125976562, + "num_tokens": 744793814.0, + "step": 19517 + }, + { + "epoch": 2.4828902175295764, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.278604030609131, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8692455291748047, + "num_tokens": 744828680.0, + "step": 19518 + }, + { + "epoch": 2.483017427808167, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 16.98729133605957, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8615511059761047, + "num_tokens": 744861642.0, + "step": 19519 + }, + { + "epoch": 2.4831446380867574, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9290077686309814, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8773863911628723, + "num_tokens": 744900345.0, + "step": 19520 + }, + { + "epoch": 2.483271848365348, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8978008031845093, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.873356819152832, + "num_tokens": 744938589.0, + "step": 19521 + }, + { + "epoch": 2.4833990586439385, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9517018795013428, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8557357788085938, + "num_tokens": 744975385.0, + "step": 19522 + }, + { + "epoch": 2.483526268922529, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.02481746673584, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8561972379684448, + "num_tokens": 745010860.0, + "step": 19523 + }, + { + "epoch": 2.4836534792011196, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9494231939315796, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8580275774002075, + "num_tokens": 745047183.0, + "step": 19524 + }, + { + "epoch": 2.48378068947971, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8817734718322754, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8625832796096802, + "num_tokens": 745081906.0, + "step": 19525 + }, + { + "epoch": 2.4839078997583006, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6558239459991455, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8710615038871765, + "num_tokens": 745127285.0, + "step": 19526 + }, + { + "epoch": 2.484035110036891, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7603646516799927, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8824115991592407, + "num_tokens": 745171564.0, + "step": 19527 + }, + { + "epoch": 2.4841623203154817, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.881303310394287, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8776350617408752, + "num_tokens": 745214252.0, + "step": 19528 + }, + { + "epoch": 2.484289530594072, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8535263538360596, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8680811524391174, + "num_tokens": 745251983.0, + "step": 19529 + }, + { + "epoch": 2.4844167408726623, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3062846660614014, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8675426244735718, + "num_tokens": 745287037.0, + "step": 19530 + }, + { + "epoch": 2.4845439511512533, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8155837059020996, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8632608652114868, + "num_tokens": 745327854.0, + "step": 19531 + }, + { + "epoch": 2.4846711614298433, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9961977005004883, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8545660972595215, + "num_tokens": 745370227.0, + "step": 19532 + }, + { + "epoch": 2.484798371708434, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8010841608047485, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8641396760940552, + "num_tokens": 745409416.0, + "step": 19533 + }, + { + "epoch": 2.4849255819870244, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.008770227432251, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8461829423904419, + "num_tokens": 745449908.0, + "step": 19534 + }, + { + "epoch": 2.485052792265615, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1165549755096436, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8550587296485901, + "num_tokens": 745486384.0, + "step": 19535 + }, + { + "epoch": 2.4851800025442055, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.889058232307434, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8562502861022949, + "num_tokens": 745523058.0, + "step": 19536 + }, + { + "epoch": 2.485307212822796, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.939543604850769, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8475821018218994, + "num_tokens": 745559276.0, + "step": 19537 + }, + { + "epoch": 2.4854344231013865, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9809527397155762, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8640302419662476, + "num_tokens": 745592726.0, + "step": 19538 + }, + { + "epoch": 2.485561633379977, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8984225988388062, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8673453330993652, + "num_tokens": 745630117.0, + "step": 19539 + }, + { + "epoch": 2.4856888436585676, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9472346305847168, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8520816564559937, + "num_tokens": 745664968.0, + "step": 19540 + }, + { + "epoch": 2.485816053937158, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9206620454788208, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8664871454238892, + "num_tokens": 745701893.0, + "step": 19541 + }, + { + "epoch": 2.4859432642157486, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8949557542800903, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8615694642066956, + "num_tokens": 745736884.0, + "step": 19542 + }, + { + "epoch": 2.486070474494339, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7730920314788818, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8764756321907043, + "num_tokens": 745782381.0, + "step": 19543 + }, + { + "epoch": 2.4861976847729297, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8302909135818481, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8794399499893188, + "num_tokens": 745827630.0, + "step": 19544 + }, + { + "epoch": 2.48632489505152, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9256645441055298, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8726778626441956, + "num_tokens": 745862223.0, + "step": 19545 + }, + { + "epoch": 2.4864521053301107, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9298111200332642, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8819742202758789, + "num_tokens": 745903077.0, + "step": 19546 + }, + { + "epoch": 2.4865793156087013, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8475658893585205, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8668950796127319, + "num_tokens": 745945534.0, + "step": 19547 + }, + { + "epoch": 2.486706525887292, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8880529403686523, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8491325378417969, + "num_tokens": 745986308.0, + "step": 19548 + }, + { + "epoch": 2.4868337361658823, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9887313842773438, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8568969964981079, + "num_tokens": 746022206.0, + "step": 19549 + }, + { + "epoch": 2.486960946444473, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9955193996429443, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8726511001586914, + "num_tokens": 746059854.0, + "step": 19550 + }, + { + "epoch": 2.4870881567230634, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9159482717514038, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8734162449836731, + "num_tokens": 746103994.0, + "step": 19551 + }, + { + "epoch": 2.487215367001654, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9552183151245117, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8561810851097107, + "num_tokens": 746138491.0, + "step": 19552 + }, + { + "epoch": 2.4873425772802444, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9687151908874512, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8642042875289917, + "num_tokens": 746172598.0, + "step": 19553 + }, + { + "epoch": 2.487469787558835, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7983566522598267, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8734902739524841, + "num_tokens": 746210385.0, + "step": 19554 + }, + { + "epoch": 2.487596997837425, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7735629081726074, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8680851459503174, + "num_tokens": 746252563.0, + "step": 19555 + }, + { + "epoch": 2.487724208116016, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7619212865829468, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8721138834953308, + "num_tokens": 746292356.0, + "step": 19556 + }, + { + "epoch": 2.487851418394606, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.039022207260132, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8543928265571594, + "num_tokens": 746325467.0, + "step": 19557 + }, + { + "epoch": 2.4879786286731966, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.859116554260254, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8608751893043518, + "num_tokens": 746362285.0, + "step": 19558 + }, + { + "epoch": 2.488105838951787, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8232802152633667, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8736391663551331, + "num_tokens": 746401540.0, + "step": 19559 + }, + { + "epoch": 2.4882330492303777, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9801523685455322, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8678926825523376, + "num_tokens": 746439282.0, + "step": 19560 + }, + { + "epoch": 2.488360259508968, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7973960638046265, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8784363269805908, + "num_tokens": 746480993.0, + "step": 19561 + }, + { + "epoch": 2.4884874697875587, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8201441764831543, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8632171750068665, + "num_tokens": 746519318.0, + "step": 19562 + }, + { + "epoch": 2.4886146800661493, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9519850015640259, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8519562482833862, + "num_tokens": 746559761.0, + "step": 19563 + }, + { + "epoch": 2.48874189034474, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.923770546913147, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8733336329460144, + "num_tokens": 746591622.0, + "step": 19564 + }, + { + "epoch": 2.4888691006233303, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0552258491516113, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8632035255432129, + "num_tokens": 746627709.0, + "step": 19565 + }, + { + "epoch": 2.488996310901921, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9409276247024536, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8634248971939087, + "num_tokens": 746663445.0, + "step": 19566 + }, + { + "epoch": 2.4891235211805114, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8528681993484497, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8540371656417847, + "num_tokens": 746706869.0, + "step": 19567 + }, + { + "epoch": 2.489250731459102, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.73232102394104, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8773629665374756, + "num_tokens": 746749502.0, + "step": 19568 + }, + { + "epoch": 2.4893779417376924, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2454187870025635, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8701941967010498, + "num_tokens": 746786510.0, + "step": 19569 + }, + { + "epoch": 2.489505152016283, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.05261492729187, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8694576025009155, + "num_tokens": 746819430.0, + "step": 19570 + }, + { + "epoch": 2.4896323622948735, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.003127098083496, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8682031631469727, + "num_tokens": 746856018.0, + "step": 19571 + }, + { + "epoch": 2.489759572573464, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.862692952156067, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8598159551620483, + "num_tokens": 746897350.0, + "step": 19572 + }, + { + "epoch": 2.4898867828520546, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9249720573425293, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8584572076797485, + "num_tokens": 746937163.0, + "step": 19573 + }, + { + "epoch": 2.490013993130645, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8661943674087524, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8743596076965332, + "num_tokens": 746976140.0, + "step": 19574 + }, + { + "epoch": 2.4901412034092356, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.828481912612915, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8651583194732666, + "num_tokens": 747012576.0, + "step": 19575 + }, + { + "epoch": 2.490268413687826, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8902009725570679, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8647884130477905, + "num_tokens": 747048179.0, + "step": 19576 + }, + { + "epoch": 2.4903956239664167, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.046705484390259, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.846914529800415, + "num_tokens": 747085291.0, + "step": 19577 + }, + { + "epoch": 2.4905228342450068, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8979741334915161, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8724064826965332, + "num_tokens": 747119085.0, + "step": 19578 + }, + { + "epoch": 2.4906500445235977, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8034577369689941, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8652353286743164, + "num_tokens": 747153463.0, + "step": 19579 + }, + { + "epoch": 2.490777254802188, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8336070775985718, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8407824039459229, + "num_tokens": 747194781.0, + "step": 19580 + }, + { + "epoch": 2.4909044650807783, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8410416841506958, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8668714761734009, + "num_tokens": 747235328.0, + "step": 19581 + }, + { + "epoch": 2.491031675359369, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.775071144104004, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8625967502593994, + "num_tokens": 747278481.0, + "step": 19582 + }, + { + "epoch": 2.4911588856379594, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9839462041854858, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.880869448184967, + "num_tokens": 747313077.0, + "step": 19583 + }, + { + "epoch": 2.49128609591655, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.767162799835205, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8719301819801331, + "num_tokens": 747352693.0, + "step": 19584 + }, + { + "epoch": 2.4914133061951405, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7312968969345093, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8571143746376038, + "num_tokens": 747397717.0, + "step": 19585 + }, + { + "epoch": 2.491540516473731, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8151198625564575, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8686573505401611, + "num_tokens": 747435457.0, + "step": 19586 + }, + { + "epoch": 2.4916677267523215, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8517687320709229, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.870401918888092, + "num_tokens": 747479017.0, + "step": 19587 + }, + { + "epoch": 2.491794937030912, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.244798183441162, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8645552396774292, + "num_tokens": 747520247.0, + "step": 19588 + }, + { + "epoch": 2.4919221473095026, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9170541763305664, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8509711623191833, + "num_tokens": 747559983.0, + "step": 19589 + }, + { + "epoch": 2.492049357588093, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 80.52428436279297, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.861423134803772, + "num_tokens": 747595746.0, + "step": 19590 + }, + { + "epoch": 2.4921765678666836, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0737545490264893, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8621853590011597, + "num_tokens": 747639261.0, + "step": 19591 + }, + { + "epoch": 2.492303778145274, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.146940231323242, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8473273515701294, + "num_tokens": 747680665.0, + "step": 19592 + }, + { + "epoch": 2.4924309884238647, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0186595916748047, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.869210422039032, + "num_tokens": 747713320.0, + "step": 19593 + }, + { + "epoch": 2.492558198702455, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9835691452026367, + "learning_rate": 1e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8388793468475342, + "num_tokens": 747752167.0, + "step": 19594 + }, + { + "epoch": 2.4926854089810457, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8822591304779053, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8714988231658936, + "num_tokens": 747787274.0, + "step": 19595 + }, + { + "epoch": 2.4928126192596363, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.837099552154541, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8589439392089844, + "num_tokens": 747826647.0, + "step": 19596 + }, + { + "epoch": 2.492939829538227, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8887656927108765, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8614945411682129, + "num_tokens": 747866797.0, + "step": 19597 + }, + { + "epoch": 2.4930670398168173, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.804671287536621, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8670011758804321, + "num_tokens": 747907589.0, + "step": 19598 + }, + { + "epoch": 2.493194250095408, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0892417430877686, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8574368953704834, + "num_tokens": 747939865.0, + "step": 19599 + }, + { + "epoch": 2.4933214603739984, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8512893915176392, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8629096150398254, + "num_tokens": 747976238.0, + "step": 19600 + }, + { + "epoch": 2.493448670652589, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9357168674468994, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8640884757041931, + "num_tokens": 748012076.0, + "step": 19601 + }, + { + "epoch": 2.4935758809311794, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2124714851379395, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8573011159896851, + "num_tokens": 748042518.0, + "step": 19602 + }, + { + "epoch": 2.4937030912097695, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8855671882629395, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8732408285140991, + "num_tokens": 748077509.0, + "step": 19603 + }, + { + "epoch": 2.4938303014883605, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8480122089385986, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8706289529800415, + "num_tokens": 748118164.0, + "step": 19604 + }, + { + "epoch": 2.4939575117669506, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0500667095184326, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8573060035705566, + "num_tokens": 748153609.0, + "step": 19605 + }, + { + "epoch": 2.494084722045541, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8262072801589966, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8635504245758057, + "num_tokens": 748189573.0, + "step": 19606 + }, + { + "epoch": 2.4942119323241316, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7858902215957642, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8603312969207764, + "num_tokens": 748231121.0, + "step": 19607 + }, + { + "epoch": 2.494339142602722, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.829033374786377, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8746527433395386, + "num_tokens": 748268744.0, + "step": 19608 + }, + { + "epoch": 2.4944663528813127, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.960602879524231, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8701290488243103, + "num_tokens": 748307441.0, + "step": 19609 + }, + { + "epoch": 2.494593563159903, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9160100221633911, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.865043044090271, + "num_tokens": 748343506.0, + "step": 19610 + }, + { + "epoch": 2.4947207734384937, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.876955270767212, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8668590784072876, + "num_tokens": 748382209.0, + "step": 19611 + }, + { + "epoch": 2.4948479837170843, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9487426280975342, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8861479759216309, + "num_tokens": 748413660.0, + "step": 19612 + }, + { + "epoch": 2.494975193995675, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8406933546066284, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8664346933364868, + "num_tokens": 748457037.0, + "step": 19613 + }, + { + "epoch": 2.4951024042742653, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0123140811920166, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8437666296958923, + "num_tokens": 748497227.0, + "step": 19614 + }, + { + "epoch": 2.495229614552856, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0121164321899414, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8621866106987, + "num_tokens": 748529901.0, + "step": 19615 + }, + { + "epoch": 2.4953568248314464, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8630436658859253, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8521175384521484, + "num_tokens": 748568238.0, + "step": 19616 + }, + { + "epoch": 2.495484035110037, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8688453435897827, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8542879223823547, + "num_tokens": 748608153.0, + "step": 19617 + }, + { + "epoch": 2.4956112453886274, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8770931959152222, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8598844408988953, + "num_tokens": 748646977.0, + "step": 19618 + }, + { + "epoch": 2.495738455667218, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8075000047683716, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8553131222724915, + "num_tokens": 748689586.0, + "step": 19619 + }, + { + "epoch": 2.4958656659458085, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9094566106796265, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8633046746253967, + "num_tokens": 748730915.0, + "step": 19620 + }, + { + "epoch": 2.495992876224399, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9569646120071411, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8597725033760071, + "num_tokens": 748766996.0, + "step": 19621 + }, + { + "epoch": 2.4961200865029896, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0079586505889893, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8573505282402039, + "num_tokens": 748801225.0, + "step": 19622 + }, + { + "epoch": 2.49624729678158, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.787798523902893, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.859158456325531, + "num_tokens": 748841355.0, + "step": 19623 + }, + { + "epoch": 2.4963745070601706, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.882181167602539, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8636506795883179, + "num_tokens": 748879636.0, + "step": 19624 + }, + { + "epoch": 2.496501717338761, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9108227491378784, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8609390258789062, + "num_tokens": 748917795.0, + "step": 19625 + }, + { + "epoch": 2.4966289276173517, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.972488284111023, + "learning_rate": 1e-06, + "loss": 0.526, + "mean_token_accuracy": 0.836375892162323, + "num_tokens": 748956512.0, + "step": 19626 + }, + { + "epoch": 2.496756137895942, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8813868761062622, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8688539862632751, + "num_tokens": 748995577.0, + "step": 19627 + }, + { + "epoch": 2.4968833481745323, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1427366733551025, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8601214289665222, + "num_tokens": 749027672.0, + "step": 19628 + }, + { + "epoch": 2.4970105584531233, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1003518104553223, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8741995096206665, + "num_tokens": 749059101.0, + "step": 19629 + }, + { + "epoch": 2.4971377687317133, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.838775873184204, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8506627082824707, + "num_tokens": 749097034.0, + "step": 19630 + }, + { + "epoch": 2.497264979010304, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7796708345413208, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8612872958183289, + "num_tokens": 749137536.0, + "step": 19631 + }, + { + "epoch": 2.4973921892888944, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8602222204208374, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8639177083969116, + "num_tokens": 749171487.0, + "step": 19632 + }, + { + "epoch": 2.497519399567485, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8570821285247803, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8565832376480103, + "num_tokens": 749213234.0, + "step": 19633 + }, + { + "epoch": 2.4976466098460754, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9092535972595215, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8534163236618042, + "num_tokens": 749256184.0, + "step": 19634 + }, + { + "epoch": 2.497773820124666, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.5350875854492188, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8622965812683105, + "num_tokens": 749292909.0, + "step": 19635 + }, + { + "epoch": 2.4979010304032565, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1679394245147705, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8507283926010132, + "num_tokens": 749327947.0, + "step": 19636 + }, + { + "epoch": 2.498028240681847, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0775794982910156, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8726204633712769, + "num_tokens": 749364107.0, + "step": 19637 + }, + { + "epoch": 2.4981554509604376, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1299502849578857, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8795384168624878, + "num_tokens": 749398055.0, + "step": 19638 + }, + { + "epoch": 2.498282661239028, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9001398086547852, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8628119826316833, + "num_tokens": 749434626.0, + "step": 19639 + }, + { + "epoch": 2.4984098715176186, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.886257529258728, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.858434796333313, + "num_tokens": 749470616.0, + "step": 19640 + }, + { + "epoch": 2.498537081796209, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.861184000968933, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8665950894355774, + "num_tokens": 749513072.0, + "step": 19641 + }, + { + "epoch": 2.4986642920747997, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.015953540802002, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8774397969245911, + "num_tokens": 749546988.0, + "step": 19642 + }, + { + "epoch": 2.49879150235339, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8666377067565918, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8564527034759521, + "num_tokens": 749585851.0, + "step": 19643 + }, + { + "epoch": 2.4989187126319807, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.116771697998047, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8614070415496826, + "num_tokens": 749619510.0, + "step": 19644 + }, + { + "epoch": 2.4990459229105713, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.139540910720825, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.871898889541626, + "num_tokens": 749653660.0, + "step": 19645 + }, + { + "epoch": 2.499173133189162, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.929057002067566, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8640472888946533, + "num_tokens": 749691321.0, + "step": 19646 + }, + { + "epoch": 2.4993003434677523, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.840658187866211, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8645606637001038, + "num_tokens": 749729305.0, + "step": 19647 + }, + { + "epoch": 2.499427553746343, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 16.611753463745117, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8704971075057983, + "num_tokens": 749766134.0, + "step": 19648 + }, + { + "epoch": 2.4995547640249334, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.189890146255493, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8609678745269775, + "num_tokens": 749807715.0, + "step": 19649 + }, + { + "epoch": 2.499681974303524, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.217242956161499, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8645750880241394, + "num_tokens": 749838799.0, + "step": 19650 + }, + { + "epoch": 2.499809184582114, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9068865776062012, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8769164681434631, + "num_tokens": 749874283.0, + "step": 19651 + }, + { + "epoch": 2.499936394860705, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8876715898513794, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8622753024101257, + "num_tokens": 749915912.0, + "step": 19652 + }, + { + "epoch": 2.500063605139295, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7984120845794678, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8704240322113037, + "num_tokens": 749959133.0, + "step": 19653 + }, + { + "epoch": 2.500190815417886, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9154301881790161, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8614469170570374, + "num_tokens": 749997846.0, + "step": 19654 + }, + { + "epoch": 2.500318025696476, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9009696245193481, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.869278073310852, + "num_tokens": 750040973.0, + "step": 19655 + }, + { + "epoch": 2.5004452359750666, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9062187671661377, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8597502708435059, + "num_tokens": 750082171.0, + "step": 19656 + }, + { + "epoch": 2.500572446253657, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.821907877922058, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8761164546012878, + "num_tokens": 750123083.0, + "step": 19657 + }, + { + "epoch": 2.5006996565322477, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.052013874053955, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8502830862998962, + "num_tokens": 750162235.0, + "step": 19658 + }, + { + "epoch": 2.500826866810838, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9516762495040894, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8759324550628662, + "num_tokens": 750196501.0, + "step": 19659 + }, + { + "epoch": 2.5009540770894287, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9473927021026611, + "learning_rate": 1e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8366569876670837, + "num_tokens": 750241178.0, + "step": 19660 + }, + { + "epoch": 2.5010812873680193, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.766629695892334, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8684819936752319, + "num_tokens": 750284104.0, + "step": 19661 + }, + { + "epoch": 2.50120849764661, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9932233095169067, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.87090003490448, + "num_tokens": 750318525.0, + "step": 19662 + }, + { + "epoch": 2.5013357079252003, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8224036693572998, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8724621534347534, + "num_tokens": 750358892.0, + "step": 19663 + }, + { + "epoch": 2.501462918203791, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7957963943481445, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8518029451370239, + "num_tokens": 750399325.0, + "step": 19664 + }, + { + "epoch": 2.5015901284823814, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8070392608642578, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8587384223937988, + "num_tokens": 750437493.0, + "step": 19665 + }, + { + "epoch": 2.501717338760972, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8426640033721924, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8716331124305725, + "num_tokens": 750472381.0, + "step": 19666 + }, + { + "epoch": 2.5018445490395624, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9139723777770996, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.846442699432373, + "num_tokens": 750511786.0, + "step": 19667 + }, + { + "epoch": 2.501971759318153, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 16.606565475463867, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8709198236465454, + "num_tokens": 750550596.0, + "step": 19668 + }, + { + "epoch": 2.5020989695967435, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.955026388168335, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8680158853530884, + "num_tokens": 750594790.0, + "step": 19669 + }, + { + "epoch": 2.502226179875334, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8936959505081177, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.876105010509491, + "num_tokens": 750636738.0, + "step": 19670 + }, + { + "epoch": 2.5023533901539246, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8569689989089966, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8768867254257202, + "num_tokens": 750672338.0, + "step": 19671 + }, + { + "epoch": 2.502480600432515, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8741661310195923, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8574529886245728, + "num_tokens": 750709149.0, + "step": 19672 + }, + { + "epoch": 2.5026078107111056, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8575178384780884, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8763080835342407, + "num_tokens": 750741727.0, + "step": 19673 + }, + { + "epoch": 2.5027350209896957, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9521377086639404, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8528282642364502, + "num_tokens": 750775903.0, + "step": 19674 + }, + { + "epoch": 2.5028622312682867, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8504327535629272, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8597948551177979, + "num_tokens": 750810502.0, + "step": 19675 + }, + { + "epoch": 2.5029894415468767, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9567142724990845, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8733579516410828, + "num_tokens": 750847762.0, + "step": 19676 + }, + { + "epoch": 2.5031166518254677, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9490430355072021, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8655990362167358, + "num_tokens": 750887201.0, + "step": 19677 + }, + { + "epoch": 2.503243862104058, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9218758344650269, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8636113405227661, + "num_tokens": 750927323.0, + "step": 19678 + }, + { + "epoch": 2.5033710723826488, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.716210961341858, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8553529977798462, + "num_tokens": 750972501.0, + "step": 19679 + }, + { + "epoch": 2.503498282661239, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8463443517684937, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.874392032623291, + "num_tokens": 751010743.0, + "step": 19680 + }, + { + "epoch": 2.5036254929398294, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.032245397567749, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8592542409896851, + "num_tokens": 751050440.0, + "step": 19681 + }, + { + "epoch": 2.50375270321842, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9490891695022583, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8681095838546753, + "num_tokens": 751088319.0, + "step": 19682 + }, + { + "epoch": 2.5038799134970104, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9883593320846558, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8640825748443604, + "num_tokens": 751123766.0, + "step": 19683 + }, + { + "epoch": 2.504007123775601, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7921324968338013, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8824397325515747, + "num_tokens": 751163375.0, + "step": 19684 + }, + { + "epoch": 2.5041343340541915, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8128911256790161, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8595622181892395, + "num_tokens": 751204526.0, + "step": 19685 + }, + { + "epoch": 2.504261544332782, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0410001277923584, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8691082000732422, + "num_tokens": 751245133.0, + "step": 19686 + }, + { + "epoch": 2.5043887546113726, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9278348684310913, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.853935956954956, + "num_tokens": 751284578.0, + "step": 19687 + }, + { + "epoch": 2.504515964889963, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9747917652130127, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8565728664398193, + "num_tokens": 751325794.0, + "step": 19688 + }, + { + "epoch": 2.5046431751685536, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.721083402633667, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8703468441963196, + "num_tokens": 751363777.0, + "step": 19689 + }, + { + "epoch": 2.504770385447144, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9535051584243774, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8553981781005859, + "num_tokens": 751400499.0, + "step": 19690 + }, + { + "epoch": 2.5048975957257347, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9717074632644653, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8611726760864258, + "num_tokens": 751434895.0, + "step": 19691 + }, + { + "epoch": 2.505024806004325, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7549196481704712, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8746887445449829, + "num_tokens": 751473984.0, + "step": 19692 + }, + { + "epoch": 2.5051520162829157, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8430006504058838, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8644081950187683, + "num_tokens": 751515030.0, + "step": 19693 + }, + { + "epoch": 2.5052792265615063, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9147067070007324, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8629469871520996, + "num_tokens": 751550023.0, + "step": 19694 + }, + { + "epoch": 2.505406436840097, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9542460441589355, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8646047115325928, + "num_tokens": 751584793.0, + "step": 19695 + }, + { + "epoch": 2.5055336471186873, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.013183116912842, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8454706072807312, + "num_tokens": 751619817.0, + "step": 19696 + }, + { + "epoch": 2.505660857397278, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8852322101593018, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8711233735084534, + "num_tokens": 751657097.0, + "step": 19697 + }, + { + "epoch": 2.5057880676758684, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 3.995121479034424, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8544785380363464, + "num_tokens": 751687042.0, + "step": 19698 + }, + { + "epoch": 2.5059152779544585, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9802782535552979, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.873104989528656, + "num_tokens": 751724281.0, + "step": 19699 + }, + { + "epoch": 2.5060424882330494, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2416629791259766, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8447674512863159, + "num_tokens": 751765144.0, + "step": 19700 + }, + { + "epoch": 2.5061696985116395, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0200283527374268, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.873162031173706, + "num_tokens": 751799602.0, + "step": 19701 + }, + { + "epoch": 2.5062969087902305, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8222980499267578, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.872897207736969, + "num_tokens": 751836972.0, + "step": 19702 + }, + { + "epoch": 2.5064241190688206, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0161187648773193, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8654447793960571, + "num_tokens": 751869627.0, + "step": 19703 + }, + { + "epoch": 2.5065513293474115, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8470486402511597, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8578491806983948, + "num_tokens": 751911103.0, + "step": 19704 + }, + { + "epoch": 2.5066785396260016, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1162421703338623, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8587743639945984, + "num_tokens": 751948956.0, + "step": 19705 + }, + { + "epoch": 2.506805749904592, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7979419231414795, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.867042601108551, + "num_tokens": 751992185.0, + "step": 19706 + }, + { + "epoch": 2.5069329601831827, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.798811912536621, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8602070808410645, + "num_tokens": 752032661.0, + "step": 19707 + }, + { + "epoch": 2.507060170461773, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.851072907447815, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8693030476570129, + "num_tokens": 752073946.0, + "step": 19708 + }, + { + "epoch": 2.5071873807403637, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9611188173294067, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8480087518692017, + "num_tokens": 752118766.0, + "step": 19709 + }, + { + "epoch": 2.5073145910189543, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7891114950180054, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8676809668540955, + "num_tokens": 752159669.0, + "step": 19710 + }, + { + "epoch": 2.507441801297545, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.839249610900879, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8676033020019531, + "num_tokens": 752198139.0, + "step": 19711 + }, + { + "epoch": 2.5075690115761353, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.162210702896118, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8665826320648193, + "num_tokens": 752239686.0, + "step": 19712 + }, + { + "epoch": 2.507696221854726, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9267387390136719, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.846779465675354, + "num_tokens": 752275740.0, + "step": 19713 + }, + { + "epoch": 2.5078234321333164, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0952870845794678, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8510168194770813, + "num_tokens": 752310554.0, + "step": 19714 + }, + { + "epoch": 2.507950642411907, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8317569494247437, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8699743747711182, + "num_tokens": 752347761.0, + "step": 19715 + }, + { + "epoch": 2.5080778526904974, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.049095392227173, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8527587056159973, + "num_tokens": 752387128.0, + "step": 19716 + }, + { + "epoch": 2.508205062969088, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7996431589126587, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8622531890869141, + "num_tokens": 752428722.0, + "step": 19717 + }, + { + "epoch": 2.5083322732476785, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8516249656677246, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8627914190292358, + "num_tokens": 752468920.0, + "step": 19718 + }, + { + "epoch": 2.508459483526269, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9695696830749512, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8546473979949951, + "num_tokens": 752505226.0, + "step": 19719 + }, + { + "epoch": 2.5085866938048595, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9207836389541626, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8655310869216919, + "num_tokens": 752543693.0, + "step": 19720 + }, + { + "epoch": 2.50871390408345, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.909969449043274, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8484288454055786, + "num_tokens": 752578758.0, + "step": 19721 + }, + { + "epoch": 2.5088411143620406, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9188872575759888, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.87543123960495, + "num_tokens": 752612115.0, + "step": 19722 + }, + { + "epoch": 2.508968324640631, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.050438165664673, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8698077201843262, + "num_tokens": 752644601.0, + "step": 19723 + }, + { + "epoch": 2.509095534919221, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8745300769805908, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8688523769378662, + "num_tokens": 752682333.0, + "step": 19724 + }, + { + "epoch": 2.509222745197812, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.685861587524414, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8594855070114136, + "num_tokens": 752721802.0, + "step": 19725 + }, + { + "epoch": 2.5093499554764023, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9538259506225586, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8792469501495361, + "num_tokens": 752755673.0, + "step": 19726 + }, + { + "epoch": 2.5094771657549932, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.272421360015869, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8748656511306763, + "num_tokens": 752795575.0, + "step": 19727 + }, + { + "epoch": 2.5096043760335833, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8520017862319946, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8813937902450562, + "num_tokens": 752833172.0, + "step": 19728 + }, + { + "epoch": 2.5097315863121743, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.8036599159240723, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8501982688903809, + "num_tokens": 752872857.0, + "step": 19729 + }, + { + "epoch": 2.5098587965907644, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.055795431137085, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8734496831893921, + "num_tokens": 752908553.0, + "step": 19730 + }, + { + "epoch": 2.509986006869355, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.065837860107422, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8703725337982178, + "num_tokens": 752937762.0, + "step": 19731 + }, + { + "epoch": 2.5101132171479454, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.928490161895752, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8505529761314392, + "num_tokens": 752977184.0, + "step": 19732 + }, + { + "epoch": 2.510240427426536, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9511194229125977, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8589086532592773, + "num_tokens": 753021071.0, + "step": 19733 + }, + { + "epoch": 2.5103676377051265, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.993120551109314, + "learning_rate": 1e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.8382117748260498, + "num_tokens": 753056597.0, + "step": 19734 + }, + { + "epoch": 2.510494847983717, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9076615571975708, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8666229248046875, + "num_tokens": 753096178.0, + "step": 19735 + }, + { + "epoch": 2.5106220582623076, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.875777244567871, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8676170706748962, + "num_tokens": 753130388.0, + "step": 19736 + }, + { + "epoch": 2.510749268540898, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.779192328453064, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8728091716766357, + "num_tokens": 753166585.0, + "step": 19737 + }, + { + "epoch": 2.5108764788194886, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8870078325271606, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8740332126617432, + "num_tokens": 753203403.0, + "step": 19738 + }, + { + "epoch": 2.511003689098079, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8891246318817139, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8704249858856201, + "num_tokens": 753241632.0, + "step": 19739 + }, + { + "epoch": 2.5111308993766697, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.388542413711548, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8779094219207764, + "num_tokens": 753279292.0, + "step": 19740 + }, + { + "epoch": 2.51125810965526, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.952919602394104, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8538733124732971, + "num_tokens": 753319121.0, + "step": 19741 + }, + { + "epoch": 2.5113853199338507, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3392531871795654, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.85000079870224, + "num_tokens": 753353074.0, + "step": 19742 + }, + { + "epoch": 2.5115125302124413, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8272826671600342, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8822033405303955, + "num_tokens": 753388994.0, + "step": 19743 + }, + { + "epoch": 2.511639740491032, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.867903709411621, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8703882694244385, + "num_tokens": 753430271.0, + "step": 19744 + }, + { + "epoch": 2.5117669507696223, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.6745986938476562, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.85710608959198, + "num_tokens": 753473820.0, + "step": 19745 + }, + { + "epoch": 2.511894161048213, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.790374755859375, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8603375554084778, + "num_tokens": 753516397.0, + "step": 19746 + }, + { + "epoch": 2.5120213713268034, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7792367935180664, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8747377395629883, + "num_tokens": 753557589.0, + "step": 19747 + }, + { + "epoch": 2.512148581605394, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8901933431625366, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8734584450721741, + "num_tokens": 753593764.0, + "step": 19748 + }, + { + "epoch": 2.512275791883984, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.845005989074707, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8517366051673889, + "num_tokens": 753635967.0, + "step": 19749 + }, + { + "epoch": 2.512403002162575, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.044447660446167, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8725433349609375, + "num_tokens": 753674595.0, + "step": 19750 + }, + { + "epoch": 2.512530212441165, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8959810733795166, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8486090302467346, + "num_tokens": 753714118.0, + "step": 19751 + }, + { + "epoch": 2.512657422719756, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.933354377746582, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8499314785003662, + "num_tokens": 753758139.0, + "step": 19752 + }, + { + "epoch": 2.512784632998346, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.826873540878296, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8531835079193115, + "num_tokens": 753800373.0, + "step": 19753 + }, + { + "epoch": 2.5129118432769366, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2390172481536865, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8600255250930786, + "num_tokens": 753832366.0, + "step": 19754 + }, + { + "epoch": 2.513039053555527, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9150960445404053, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8829525709152222, + "num_tokens": 753866176.0, + "step": 19755 + }, + { + "epoch": 2.5131662638341177, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7029571533203125, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8633553981781006, + "num_tokens": 753911470.0, + "step": 19756 + }, + { + "epoch": 2.513293474112708, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8845109939575195, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8624098300933838, + "num_tokens": 753953027.0, + "step": 19757 + }, + { + "epoch": 2.5134206843912987, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9505870342254639, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8590754270553589, + "num_tokens": 753987992.0, + "step": 19758 + }, + { + "epoch": 2.5135478946698893, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9661682844161987, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8685575723648071, + "num_tokens": 754026671.0, + "step": 19759 + }, + { + "epoch": 2.51367510494848, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.86945641040802, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8640890121459961, + "num_tokens": 754063526.0, + "step": 19760 + }, + { + "epoch": 2.5138023152270703, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.996787190437317, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8488490581512451, + "num_tokens": 754103361.0, + "step": 19761 + }, + { + "epoch": 2.513929525505661, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.913405418395996, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8664084672927856, + "num_tokens": 754141969.0, + "step": 19762 + }, + { + "epoch": 2.5140567357842514, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.747101068496704, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8803197145462036, + "num_tokens": 754182691.0, + "step": 19763 + }, + { + "epoch": 2.514183946062842, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9062612056732178, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8644250631332397, + "num_tokens": 754219956.0, + "step": 19764 + }, + { + "epoch": 2.5143111563414324, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8168243169784546, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8593509197235107, + "num_tokens": 754259467.0, + "step": 19765 + }, + { + "epoch": 2.514438366620023, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8398233652114868, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8671547174453735, + "num_tokens": 754300344.0, + "step": 19766 + }, + { + "epoch": 2.5145655768986135, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7479346990585327, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8652439713478088, + "num_tokens": 754342606.0, + "step": 19767 + }, + { + "epoch": 2.514692787177204, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9147202968597412, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8605920076370239, + "num_tokens": 754379946.0, + "step": 19768 + }, + { + "epoch": 2.5148199974557945, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.895566463470459, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8781393766403198, + "num_tokens": 754414353.0, + "step": 19769 + }, + { + "epoch": 2.514947207734385, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.988905429840088, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8501737713813782, + "num_tokens": 754455506.0, + "step": 19770 + }, + { + "epoch": 2.5150744180129756, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9122096300125122, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8480623960494995, + "num_tokens": 754498316.0, + "step": 19771 + }, + { + "epoch": 2.5152016282915657, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.182084083557129, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8530123233795166, + "num_tokens": 754534943.0, + "step": 19772 + }, + { + "epoch": 2.5153288385701567, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9510977268218994, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8613395094871521, + "num_tokens": 754572775.0, + "step": 19773 + }, + { + "epoch": 2.5154560488487467, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8948403596878052, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8737940788269043, + "num_tokens": 754610473.0, + "step": 19774 + }, + { + "epoch": 2.5155832591273377, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9241149425506592, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8651952743530273, + "num_tokens": 754649527.0, + "step": 19775 + }, + { + "epoch": 2.515710469405928, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8061152696609497, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8666740655899048, + "num_tokens": 754691513.0, + "step": 19776 + }, + { + "epoch": 2.5158376796845188, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8955053091049194, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.851410984992981, + "num_tokens": 754727932.0, + "step": 19777 + }, + { + "epoch": 2.515964889963109, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9977848529815674, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8702138662338257, + "num_tokens": 754764571.0, + "step": 19778 + }, + { + "epoch": 2.5160921002416994, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8299845457077026, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8783029317855835, + "num_tokens": 754805846.0, + "step": 19779 + }, + { + "epoch": 2.51621931052029, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8256621360778809, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8735620379447937, + "num_tokens": 754844016.0, + "step": 19780 + }, + { + "epoch": 2.5163465207988804, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7535154819488525, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.864283561706543, + "num_tokens": 754886493.0, + "step": 19781 + }, + { + "epoch": 2.516473731077471, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7807130813598633, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8670529127120972, + "num_tokens": 754928195.0, + "step": 19782 + }, + { + "epoch": 2.5166009413560615, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0754966735839844, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8735559582710266, + "num_tokens": 754958399.0, + "step": 19783 + }, + { + "epoch": 2.516728151634652, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9785984754562378, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8686342239379883, + "num_tokens": 754997926.0, + "step": 19784 + }, + { + "epoch": 2.5168553619132426, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7998062372207642, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8708984851837158, + "num_tokens": 755040577.0, + "step": 19785 + }, + { + "epoch": 2.516982572191833, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.028193235397339, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8611210584640503, + "num_tokens": 755070458.0, + "step": 19786 + }, + { + "epoch": 2.5171097824704236, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7936782836914062, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8436756730079651, + "num_tokens": 755117205.0, + "step": 19787 + }, + { + "epoch": 2.517236992749014, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7700642347335815, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8674557209014893, + "num_tokens": 755162098.0, + "step": 19788 + }, + { + "epoch": 2.5173642030276047, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9098834991455078, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8626692295074463, + "num_tokens": 755199655.0, + "step": 19789 + }, + { + "epoch": 2.517491413306195, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8890588283538818, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8785619735717773, + "num_tokens": 755234165.0, + "step": 19790 + }, + { + "epoch": 2.5176186235847857, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1007931232452393, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8649175763130188, + "num_tokens": 755271933.0, + "step": 19791 + }, + { + "epoch": 2.5177458338633762, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.087888717651367, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8736984133720398, + "num_tokens": 755306903.0, + "step": 19792 + }, + { + "epoch": 2.5178730441419668, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8524025678634644, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8728646039962769, + "num_tokens": 755347613.0, + "step": 19793 + }, + { + "epoch": 2.5180002544205573, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9230842590332031, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8840662240982056, + "num_tokens": 755380541.0, + "step": 19794 + }, + { + "epoch": 2.518127464699148, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.947538137435913, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8636845350265503, + "num_tokens": 755422502.0, + "step": 19795 + }, + { + "epoch": 2.5182546749777384, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3810932636260986, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8659476041793823, + "num_tokens": 755463975.0, + "step": 19796 + }, + { + "epoch": 2.5183818852563284, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8810181617736816, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8652106523513794, + "num_tokens": 755504873.0, + "step": 19797 + }, + { + "epoch": 2.5185090955349194, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9985935688018799, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8402221202850342, + "num_tokens": 755545646.0, + "step": 19798 + }, + { + "epoch": 2.5186363058135095, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1556408405303955, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8520656824111938, + "num_tokens": 755577228.0, + "step": 19799 + }, + { + "epoch": 2.5187635160921005, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7870604991912842, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8732955455780029, + "num_tokens": 755621437.0, + "step": 19800 + }, + { + "epoch": 2.5188907263706906, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9576332569122314, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.863964319229126, + "num_tokens": 755658266.0, + "step": 19801 + }, + { + "epoch": 2.5190179366492815, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9072544574737549, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8513561487197876, + "num_tokens": 755696755.0, + "step": 19802 + }, + { + "epoch": 2.5191451469278716, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9441872835159302, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8788361549377441, + "num_tokens": 755732434.0, + "step": 19803 + }, + { + "epoch": 2.519272357206462, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9064184427261353, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8682225942611694, + "num_tokens": 755768488.0, + "step": 19804 + }, + { + "epoch": 2.5193995674850527, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0185627937316895, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8701276779174805, + "num_tokens": 755799637.0, + "step": 19805 + }, + { + "epoch": 2.519526777763643, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.046201229095459, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8500685095787048, + "num_tokens": 755834229.0, + "step": 19806 + }, + { + "epoch": 2.5196539880422337, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8495044708251953, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8690052628517151, + "num_tokens": 755871776.0, + "step": 19807 + }, + { + "epoch": 2.5197811983208243, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8260058164596558, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8754768371582031, + "num_tokens": 755911125.0, + "step": 19808 + }, + { + "epoch": 2.519908408599415, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8777236938476562, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8739343881607056, + "num_tokens": 755952204.0, + "step": 19809 + }, + { + "epoch": 2.5200356188780053, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1941397190093994, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8551120758056641, + "num_tokens": 755996702.0, + "step": 19810 + }, + { + "epoch": 2.520162829156596, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9065366983413696, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8826477527618408, + "num_tokens": 756033285.0, + "step": 19811 + }, + { + "epoch": 2.5202900394351864, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8990845680236816, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8649494051933289, + "num_tokens": 756068851.0, + "step": 19812 + }, + { + "epoch": 2.520417249713777, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.851714849472046, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8567737340927124, + "num_tokens": 756111676.0, + "step": 19813 + }, + { + "epoch": 2.5205444599923674, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0344231128692627, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8478257060050964, + "num_tokens": 756146575.0, + "step": 19814 + }, + { + "epoch": 2.520671670270958, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9134141206741333, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8545559048652649, + "num_tokens": 756190006.0, + "step": 19815 + }, + { + "epoch": 2.5207988805495485, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.6997407674789429, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8759527802467346, + "num_tokens": 756232828.0, + "step": 19816 + }, + { + "epoch": 2.520926090828139, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.866439700126648, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8700971603393555, + "num_tokens": 756273077.0, + "step": 19817 + }, + { + "epoch": 2.5210533011067295, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.978265643119812, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8488704562187195, + "num_tokens": 756313421.0, + "step": 19818 + }, + { + "epoch": 2.52118051138532, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.4594480991363525, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8596655130386353, + "num_tokens": 756356461.0, + "step": 19819 + }, + { + "epoch": 2.5213077216639106, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2269680500030518, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8599549531936646, + "num_tokens": 756388309.0, + "step": 19820 + }, + { + "epoch": 2.521434931942501, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 20.46368408203125, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8765643835067749, + "num_tokens": 756426655.0, + "step": 19821 + }, + { + "epoch": 2.521562142221091, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.287018060684204, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8606967926025391, + "num_tokens": 756464961.0, + "step": 19822 + }, + { + "epoch": 2.521689352499682, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0500054359436035, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8790066838264465, + "num_tokens": 756497338.0, + "step": 19823 + }, + { + "epoch": 2.5218165627782723, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8885060548782349, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8711076974868774, + "num_tokens": 756535324.0, + "step": 19824 + }, + { + "epoch": 2.5219437730568632, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8861321210861206, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8743650913238525, + "num_tokens": 756574138.0, + "step": 19825 + }, + { + "epoch": 2.5220709833354533, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9387147426605225, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8751709461212158, + "num_tokens": 756608967.0, + "step": 19826 + }, + { + "epoch": 2.522198193614044, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.97393000125885, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8434069156646729, + "num_tokens": 756653882.0, + "step": 19827 + }, + { + "epoch": 2.5223254038926344, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8277599811553955, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8599079847335815, + "num_tokens": 756695638.0, + "step": 19828 + }, + { + "epoch": 2.522452614171225, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.090744733810425, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8553195595741272, + "num_tokens": 756734964.0, + "step": 19829 + }, + { + "epoch": 2.5225798244498154, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9332948923110962, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8682110905647278, + "num_tokens": 756768454.0, + "step": 19830 + }, + { + "epoch": 2.522707034728406, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0577986240386963, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8535040616989136, + "num_tokens": 756801696.0, + "step": 19831 + }, + { + "epoch": 2.5228342450069965, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0003445148468018, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8680179715156555, + "num_tokens": 756839874.0, + "step": 19832 + }, + { + "epoch": 2.522961455285587, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.4058115482330322, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8543976545333862, + "num_tokens": 756884179.0, + "step": 19833 + }, + { + "epoch": 2.5230886655641775, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8502219915390015, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8666989803314209, + "num_tokens": 756922725.0, + "step": 19834 + }, + { + "epoch": 2.523215875842768, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9612232446670532, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8613115549087524, + "num_tokens": 756956080.0, + "step": 19835 + }, + { + "epoch": 2.5233430861213586, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9033094644546509, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8549422025680542, + "num_tokens": 756995137.0, + "step": 19836 + }, + { + "epoch": 2.523470296399949, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9911282062530518, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8687399625778198, + "num_tokens": 757035300.0, + "step": 19837 + }, + { + "epoch": 2.5235975066785397, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.020007848739624, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8394784331321716, + "num_tokens": 757068789.0, + "step": 19838 + }, + { + "epoch": 2.52372471695713, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9886372089385986, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8580633401870728, + "num_tokens": 757102913.0, + "step": 19839 + }, + { + "epoch": 2.5238519272357207, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9990122318267822, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.866614818572998, + "num_tokens": 757138932.0, + "step": 19840 + }, + { + "epoch": 2.5239791375143112, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.796962022781372, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8674697279930115, + "num_tokens": 757179181.0, + "step": 19841 + }, + { + "epoch": 2.5241063477929018, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.886338710784912, + "learning_rate": 1e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.8338509798049927, + "num_tokens": 757224209.0, + "step": 19842 + }, + { + "epoch": 2.5242335580714923, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9129023551940918, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8784165382385254, + "num_tokens": 757262398.0, + "step": 19843 + }, + { + "epoch": 2.524360768350083, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7931463718414307, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8546436429023743, + "num_tokens": 757301744.0, + "step": 19844 + }, + { + "epoch": 2.5244879786286734, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.878800392150879, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8685795664787292, + "num_tokens": 757338273.0, + "step": 19845 + }, + { + "epoch": 2.524615188907264, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.736745834350586, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8577003479003906, + "num_tokens": 757383169.0, + "step": 19846 + }, + { + "epoch": 2.524742399185854, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.861547589302063, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.854137659072876, + "num_tokens": 757420998.0, + "step": 19847 + }, + { + "epoch": 2.524869609464445, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7752695083618164, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8688982725143433, + "num_tokens": 757463281.0, + "step": 19848 + }, + { + "epoch": 2.524996819743035, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0696568489074707, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8577201962471008, + "num_tokens": 757495022.0, + "step": 19849 + }, + { + "epoch": 2.525124030021626, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.3203213214874268, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8692277669906616, + "num_tokens": 757536801.0, + "step": 19850 + }, + { + "epoch": 2.525251240300216, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8771722316741943, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8797515630722046, + "num_tokens": 757573419.0, + "step": 19851 + }, + { + "epoch": 2.5253784505788066, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9536007642745972, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8568910360336304, + "num_tokens": 757610773.0, + "step": 19852 + }, + { + "epoch": 2.525505660857397, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 3.9490342140197754, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8748027086257935, + "num_tokens": 757648067.0, + "step": 19853 + }, + { + "epoch": 2.5256328711359877, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.026740312576294, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8709349632263184, + "num_tokens": 757681598.0, + "step": 19854 + }, + { + "epoch": 2.525760081414578, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.014010190963745, + "learning_rate": 1e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8457534909248352, + "num_tokens": 757724752.0, + "step": 19855 + }, + { + "epoch": 2.5258872916931687, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.894508957862854, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8755157589912415, + "num_tokens": 757763801.0, + "step": 19856 + }, + { + "epoch": 2.5260145019717593, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.146390438079834, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8617937564849854, + "num_tokens": 757799235.0, + "step": 19857 + }, + { + "epoch": 2.52614171225035, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.842295527458191, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8677452206611633, + "num_tokens": 757839695.0, + "step": 19858 + }, + { + "epoch": 2.5262689225289403, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8890022039413452, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8520187139511108, + "num_tokens": 757877152.0, + "step": 19859 + }, + { + "epoch": 2.526396132807531, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8070321083068848, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8635022640228271, + "num_tokens": 757916173.0, + "step": 19860 + }, + { + "epoch": 2.5265233430861214, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9200884103775024, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8695807456970215, + "num_tokens": 757957271.0, + "step": 19861 + }, + { + "epoch": 2.526650553364712, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9050358533859253, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8586013317108154, + "num_tokens": 757995553.0, + "step": 19862 + }, + { + "epoch": 2.5267777636433024, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8301881551742554, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8537886142730713, + "num_tokens": 758035095.0, + "step": 19863 + }, + { + "epoch": 2.526904973921893, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 16.60990333557129, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8639767169952393, + "num_tokens": 758071185.0, + "step": 19864 + }, + { + "epoch": 2.5270321842004835, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.1151881217956543, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8658373355865479, + "num_tokens": 758110251.0, + "step": 19865 + }, + { + "epoch": 2.527159394479074, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.5400028228759766, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8509661555290222, + "num_tokens": 758144385.0, + "step": 19866 + }, + { + "epoch": 2.5272866047576645, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8665333986282349, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8760643005371094, + "num_tokens": 758185935.0, + "step": 19867 + }, + { + "epoch": 2.527413815036255, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7919459342956543, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8684274554252625, + "num_tokens": 758226688.0, + "step": 19868 + }, + { + "epoch": 2.5275410253148456, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.863797664642334, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8701270222663879, + "num_tokens": 758263866.0, + "step": 19869 + }, + { + "epoch": 2.5276682355934357, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.7555832862854004, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8707419037818909, + "num_tokens": 758303607.0, + "step": 19870 + }, + { + "epoch": 2.5277954458720266, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.99198579788208, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8671836256980896, + "num_tokens": 758340120.0, + "step": 19871 + }, + { + "epoch": 2.5279226561506167, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.211113691329956, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8619205355644226, + "num_tokens": 758373792.0, + "step": 19872 + }, + { + "epoch": 2.5280498664292077, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0992071628570557, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8524677753448486, + "num_tokens": 758418157.0, + "step": 19873 + }, + { + "epoch": 2.528177076707798, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9243947267532349, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8561116456985474, + "num_tokens": 758454042.0, + "step": 19874 + }, + { + "epoch": 2.5283042869863888, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8006302118301392, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8662433624267578, + "num_tokens": 758495305.0, + "step": 19875 + }, + { + "epoch": 2.528431497264979, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9229936599731445, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8711246252059937, + "num_tokens": 758532646.0, + "step": 19876 + }, + { + "epoch": 2.5285587075435694, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.95854914188385, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8592350482940674, + "num_tokens": 758568329.0, + "step": 19877 + }, + { + "epoch": 2.52868591782216, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.1852831840515137, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8681416511535645, + "num_tokens": 758608246.0, + "step": 19878 + }, + { + "epoch": 2.5288131281007504, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.819556713104248, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8743880987167358, + "num_tokens": 758645356.0, + "step": 19879 + }, + { + "epoch": 2.528940338379341, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8385714292526245, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8803011178970337, + "num_tokens": 758685657.0, + "step": 19880 + }, + { + "epoch": 2.5290675486579315, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0052757263183594, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8687472343444824, + "num_tokens": 758718843.0, + "step": 19881 + }, + { + "epoch": 2.529194758936522, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9132863283157349, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8695213794708252, + "num_tokens": 758757946.0, + "step": 19882 + }, + { + "epoch": 2.5293219692151125, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.144850254058838, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8443518877029419, + "num_tokens": 758792832.0, + "step": 19883 + }, + { + "epoch": 2.529449179493703, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0593249797821045, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8561869859695435, + "num_tokens": 758826062.0, + "step": 19884 + }, + { + "epoch": 2.5295763897722936, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.958424687385559, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8745970726013184, + "num_tokens": 758860940.0, + "step": 19885 + }, + { + "epoch": 2.529703600050884, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.879967212677002, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8639999628067017, + "num_tokens": 758900740.0, + "step": 19886 + }, + { + "epoch": 2.5298308103294747, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9020843505859375, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8603419065475464, + "num_tokens": 758942786.0, + "step": 19887 + }, + { + "epoch": 2.529958020608065, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8897913694381714, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8741499781608582, + "num_tokens": 758983376.0, + "step": 19888 + }, + { + "epoch": 2.5300852308866557, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8474383354187012, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.852874755859375, + "num_tokens": 759024007.0, + "step": 19889 + }, + { + "epoch": 2.5302124411652462, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8763190507888794, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8730199933052063, + "num_tokens": 759061611.0, + "step": 19890 + }, + { + "epoch": 2.5303396514438368, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9088413715362549, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.847150444984436, + "num_tokens": 759101942.0, + "step": 19891 + }, + { + "epoch": 2.5304668617224273, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.872017741203308, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8808932304382324, + "num_tokens": 759140281.0, + "step": 19892 + }, + { + "epoch": 2.530594072001018, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9212872982025146, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8642597198486328, + "num_tokens": 759174319.0, + "step": 19893 + }, + { + "epoch": 2.5307212822796084, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.878859519958496, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.871233344078064, + "num_tokens": 759208981.0, + "step": 19894 + }, + { + "epoch": 2.5308484925581984, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.009194850921631, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.865670919418335, + "num_tokens": 759240559.0, + "step": 19895 + }, + { + "epoch": 2.5309757028367894, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.872773289680481, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8758236765861511, + "num_tokens": 759280550.0, + "step": 19896 + }, + { + "epoch": 2.5311029131153795, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9337985515594482, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8576900959014893, + "num_tokens": 759321298.0, + "step": 19897 + }, + { + "epoch": 2.5312301233939705, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9774754047393799, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.875899612903595, + "num_tokens": 759354627.0, + "step": 19898 + }, + { + "epoch": 2.5313573336725606, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.114673614501953, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8603307008743286, + "num_tokens": 759388129.0, + "step": 19899 + }, + { + "epoch": 2.5314845439511515, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9354500770568848, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8749887347221375, + "num_tokens": 759423667.0, + "step": 19900 + }, + { + "epoch": 2.5316117542297416, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8055998086929321, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8558040857315063, + "num_tokens": 759466586.0, + "step": 19901 + }, + { + "epoch": 2.531738964508332, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9257595539093018, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8593305349349976, + "num_tokens": 759504214.0, + "step": 19902 + }, + { + "epoch": 2.5318661747869227, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9478044509887695, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.849419355392456, + "num_tokens": 759543865.0, + "step": 19903 + }, + { + "epoch": 2.531993385065513, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9847382307052612, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8593340516090393, + "num_tokens": 759581917.0, + "step": 19904 + }, + { + "epoch": 2.5321205953441037, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.988235354423523, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8664372563362122, + "num_tokens": 759617641.0, + "step": 19905 + }, + { + "epoch": 2.5322478056226942, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0050771236419678, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8758574724197388, + "num_tokens": 759649097.0, + "step": 19906 + }, + { + "epoch": 2.5323750159012848, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9168083667755127, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8596572875976562, + "num_tokens": 759688802.0, + "step": 19907 + }, + { + "epoch": 2.5325022261798753, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9256336688995361, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8733874559402466, + "num_tokens": 759724243.0, + "step": 19908 + }, + { + "epoch": 2.532629436458466, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.771847128868103, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8586266040802002, + "num_tokens": 759764646.0, + "step": 19909 + }, + { + "epoch": 2.5327566467370564, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8271510601043701, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8678877949714661, + "num_tokens": 759806939.0, + "step": 19910 + }, + { + "epoch": 2.532883857015647, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8423789739608765, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8680909872055054, + "num_tokens": 759848138.0, + "step": 19911 + }, + { + "epoch": 2.5330110672942374, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.829059362411499, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8612552285194397, + "num_tokens": 759881877.0, + "step": 19912 + }, + { + "epoch": 2.533138277572828, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8524863719940186, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8758021593093872, + "num_tokens": 759922625.0, + "step": 19913 + }, + { + "epoch": 2.5332654878514185, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8628591299057007, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8758414387702942, + "num_tokens": 759952429.0, + "step": 19914 + }, + { + "epoch": 2.533392698130009, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9872056245803833, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8668160438537598, + "num_tokens": 759987678.0, + "step": 19915 + }, + { + "epoch": 2.5335199084085995, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9300439357757568, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8514798879623413, + "num_tokens": 760023943.0, + "step": 19916 + }, + { + "epoch": 2.53364711868719, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0091381072998047, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8683280944824219, + "num_tokens": 760068229.0, + "step": 19917 + }, + { + "epoch": 2.5337743289657806, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.012235403060913, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8511449098587036, + "num_tokens": 760109440.0, + "step": 19918 + }, + { + "epoch": 2.533901539244371, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0205283164978027, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8649662137031555, + "num_tokens": 760146616.0, + "step": 19919 + }, + { + "epoch": 2.534028749522961, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8869158029556274, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8459424376487732, + "num_tokens": 760188708.0, + "step": 19920 + }, + { + "epoch": 2.534155959801552, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.735276222229004, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8812420964241028, + "num_tokens": 760231355.0, + "step": 19921 + }, + { + "epoch": 2.5342831700801423, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.011634588241577, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8513549566268921, + "num_tokens": 760268316.0, + "step": 19922 + }, + { + "epoch": 2.5344103803587332, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0413358211517334, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8558621406555176, + "num_tokens": 760305420.0, + "step": 19923 + }, + { + "epoch": 2.5345375906373233, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7503488063812256, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8837180733680725, + "num_tokens": 760342195.0, + "step": 19924 + }, + { + "epoch": 2.534664800915914, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8905894756317139, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8790497779846191, + "num_tokens": 760375394.0, + "step": 19925 + }, + { + "epoch": 2.5347920111945044, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9873493909835815, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8533815145492554, + "num_tokens": 760411629.0, + "step": 19926 + }, + { + "epoch": 2.534919221473095, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7147088050842285, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8661914467811584, + "num_tokens": 760456212.0, + "step": 19927 + }, + { + "epoch": 2.5350464317516854, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8891549110412598, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8566592335700989, + "num_tokens": 760496385.0, + "step": 19928 + }, + { + "epoch": 2.535173642030276, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9032864570617676, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8632845878601074, + "num_tokens": 760530426.0, + "step": 19929 + }, + { + "epoch": 2.5353008523088665, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8770873546600342, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8752736449241638, + "num_tokens": 760568174.0, + "step": 19930 + }, + { + "epoch": 2.535428062587457, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.3590896129608154, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8829779624938965, + "num_tokens": 760608097.0, + "step": 19931 + }, + { + "epoch": 2.5355552728660475, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.081181287765503, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8649169206619263, + "num_tokens": 760643018.0, + "step": 19932 + }, + { + "epoch": 2.535682483144638, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.04970645904541, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8708875179290771, + "num_tokens": 760676553.0, + "step": 19933 + }, + { + "epoch": 2.5358096934232286, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8742425441741943, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.879855751991272, + "num_tokens": 760708850.0, + "step": 19934 + }, + { + "epoch": 2.535936903701819, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.5623466968536377, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8602003455162048, + "num_tokens": 760746088.0, + "step": 19935 + }, + { + "epoch": 2.5360641139804097, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8273375034332275, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8706496953964233, + "num_tokens": 760785716.0, + "step": 19936 + }, + { + "epoch": 2.536191324259, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8333438634872437, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.87351393699646, + "num_tokens": 760821301.0, + "step": 19937 + }, + { + "epoch": 2.5363185345375907, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.848799228668213, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8549621105194092, + "num_tokens": 760861200.0, + "step": 19938 + }, + { + "epoch": 2.5364457448161812, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0182456970214844, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8625580072402954, + "num_tokens": 760896398.0, + "step": 19939 + }, + { + "epoch": 2.5365729550947718, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.79243004322052, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8443341851234436, + "num_tokens": 760937300.0, + "step": 19940 + }, + { + "epoch": 2.5367001653733623, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0292768478393555, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8589023947715759, + "num_tokens": 760973726.0, + "step": 19941 + }, + { + "epoch": 2.536827375651953, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 3.627561330795288, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8624311685562134, + "num_tokens": 761009048.0, + "step": 19942 + }, + { + "epoch": 2.5369545859305433, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8784892559051514, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8627533912658691, + "num_tokens": 761050836.0, + "step": 19943 + }, + { + "epoch": 2.537081796209134, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9856489896774292, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8562300205230713, + "num_tokens": 761087517.0, + "step": 19944 + }, + { + "epoch": 2.537209006487724, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8876646757125854, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8784277439117432, + "num_tokens": 761121401.0, + "step": 19945 + }, + { + "epoch": 2.537336216766315, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0345189571380615, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8539948463439941, + "num_tokens": 761154973.0, + "step": 19946 + }, + { + "epoch": 2.537463427044905, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.935239553451538, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8729686141014099, + "num_tokens": 761194211.0, + "step": 19947 + }, + { + "epoch": 2.537590637323496, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.913020372390747, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8571203947067261, + "num_tokens": 761233486.0, + "step": 19948 + }, + { + "epoch": 2.537717847602086, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7542980909347534, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8645532131195068, + "num_tokens": 761275846.0, + "step": 19949 + }, + { + "epoch": 2.5378450578806766, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.3614587783813477, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8545970916748047, + "num_tokens": 761311210.0, + "step": 19950 + }, + { + "epoch": 2.537972268159267, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9738037586212158, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8651502132415771, + "num_tokens": 761349961.0, + "step": 19951 + }, + { + "epoch": 2.5380994784378577, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.100971221923828, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8711788654327393, + "num_tokens": 761386993.0, + "step": 19952 + }, + { + "epoch": 2.538226688716448, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8466137647628784, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8551502823829651, + "num_tokens": 761426910.0, + "step": 19953 + }, + { + "epoch": 2.5383538989950387, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.762817144393921, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8682467937469482, + "num_tokens": 761466633.0, + "step": 19954 + }, + { + "epoch": 2.5384811092736292, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1078577041625977, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8512805104255676, + "num_tokens": 761497670.0, + "step": 19955 + }, + { + "epoch": 2.5386083195522198, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.90220046043396, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8616931438446045, + "num_tokens": 761534626.0, + "step": 19956 + }, + { + "epoch": 2.5387355298308103, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7375127077102661, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8717565536499023, + "num_tokens": 761581125.0, + "step": 19957 + }, + { + "epoch": 2.538862740109401, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8451719284057617, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8653901815414429, + "num_tokens": 761616533.0, + "step": 19958 + }, + { + "epoch": 2.5389899503879914, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.201343059539795, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8553416728973389, + "num_tokens": 761650615.0, + "step": 19959 + }, + { + "epoch": 2.539117160666582, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8319259881973267, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8750047087669373, + "num_tokens": 761687020.0, + "step": 19960 + }, + { + "epoch": 2.5392443709451724, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0117783546447754, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.878738522529602, + "num_tokens": 761721653.0, + "step": 19961 + }, + { + "epoch": 2.539371581223763, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8590868711471558, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.872031569480896, + "num_tokens": 761758377.0, + "step": 19962 + }, + { + "epoch": 2.5394987915023535, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.913931131362915, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8631339073181152, + "num_tokens": 761795263.0, + "step": 19963 + }, + { + "epoch": 2.539626001780944, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7922561168670654, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8756886124610901, + "num_tokens": 761834758.0, + "step": 19964 + }, + { + "epoch": 2.5397532120595345, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7840116024017334, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8577171564102173, + "num_tokens": 761880482.0, + "step": 19965 + }, + { + "epoch": 2.539880422338125, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8341407775878906, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8640015125274658, + "num_tokens": 761922494.0, + "step": 19966 + }, + { + "epoch": 2.5400076326167156, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8892158269882202, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8830370903015137, + "num_tokens": 761959462.0, + "step": 19967 + }, + { + "epoch": 2.5401348428953057, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0861620903015137, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8544777631759644, + "num_tokens": 761990316.0, + "step": 19968 + }, + { + "epoch": 2.5402620531738966, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.027743339538574, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8592745065689087, + "num_tokens": 762023671.0, + "step": 19969 + }, + { + "epoch": 2.5403892634524867, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1323187351226807, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8697246313095093, + "num_tokens": 762056719.0, + "step": 19970 + }, + { + "epoch": 2.5405164737310777, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.809965968132019, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8663338422775269, + "num_tokens": 762098802.0, + "step": 19971 + }, + { + "epoch": 2.540643684009668, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0223023891448975, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8681343793869019, + "num_tokens": 762134470.0, + "step": 19972 + }, + { + "epoch": 2.5407708942882588, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8699219226837158, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8659356832504272, + "num_tokens": 762177163.0, + "step": 19973 + }, + { + "epoch": 2.540898104566849, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 3.1507625579833984, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8731757402420044, + "num_tokens": 762217323.0, + "step": 19974 + }, + { + "epoch": 2.5410253148454394, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.2583577632904053, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8592166900634766, + "num_tokens": 762252144.0, + "step": 19975 + }, + { + "epoch": 2.54115252512403, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0575127601623535, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8559867739677429, + "num_tokens": 762282108.0, + "step": 19976 + }, + { + "epoch": 2.5412797354026204, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9921578168869019, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8689640760421753, + "num_tokens": 762318154.0, + "step": 19977 + }, + { + "epoch": 2.541406945681211, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.011399984359741, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8618181347846985, + "num_tokens": 762354704.0, + "step": 19978 + }, + { + "epoch": 2.5415341559598015, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.972041368484497, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.865935206413269, + "num_tokens": 762389943.0, + "step": 19979 + }, + { + "epoch": 2.541661366238392, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.6363279819488525, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8671364784240723, + "num_tokens": 762421560.0, + "step": 19980 + }, + { + "epoch": 2.5417885765169825, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8975491523742676, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8701642751693726, + "num_tokens": 762458509.0, + "step": 19981 + }, + { + "epoch": 2.541915786795573, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2493247985839844, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8625788688659668, + "num_tokens": 762502382.0, + "step": 19982 + }, + { + "epoch": 2.5420429970741636, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9193954467773438, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8621400594711304, + "num_tokens": 762536611.0, + "step": 19983 + }, + { + "epoch": 2.542170207352754, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.793755054473877, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8655619621276855, + "num_tokens": 762577801.0, + "step": 19984 + }, + { + "epoch": 2.5422974176313446, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0177154541015625, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8663244843482971, + "num_tokens": 762617413.0, + "step": 19985 + }, + { + "epoch": 2.542424627909935, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0763964653015137, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8744423389434814, + "num_tokens": 762654811.0, + "step": 19986 + }, + { + "epoch": 2.5425518381885257, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8436636924743652, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.869551956653595, + "num_tokens": 762692902.0, + "step": 19987 + }, + { + "epoch": 2.5426790484671162, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8122495412826538, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8669604063034058, + "num_tokens": 762731984.0, + "step": 19988 + }, + { + "epoch": 2.5428062587457068, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.839942216873169, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8744590282440186, + "num_tokens": 762767104.0, + "step": 19989 + }, + { + "epoch": 2.5429334690242973, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.853556752204895, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8542364835739136, + "num_tokens": 762804982.0, + "step": 19990 + }, + { + "epoch": 2.543060679302888, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9687583446502686, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8554166555404663, + "num_tokens": 762842956.0, + "step": 19991 + }, + { + "epoch": 2.5431878895814783, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.97796630859375, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8713885545730591, + "num_tokens": 762875445.0, + "step": 19992 + }, + { + "epoch": 2.5433150998600684, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0541341304779053, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8779443502426147, + "num_tokens": 762906061.0, + "step": 19993 + }, + { + "epoch": 2.5434423101386594, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7927390336990356, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8733890056610107, + "num_tokens": 762945550.0, + "step": 19994 + }, + { + "epoch": 2.5435695204172495, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.215451955795288, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8653069734573364, + "num_tokens": 762982586.0, + "step": 19995 + }, + { + "epoch": 2.5436967306958405, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.095654010772705, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8643055558204651, + "num_tokens": 763013527.0, + "step": 19996 + }, + { + "epoch": 2.5438239409744305, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7317135334014893, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8589591383934021, + "num_tokens": 763054363.0, + "step": 19997 + }, + { + "epoch": 2.5439511512530215, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7864997386932373, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8614109754562378, + "num_tokens": 763096834.0, + "step": 19998 + }, + { + "epoch": 2.5440783615316116, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.6706490516662598, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8640440702438354, + "num_tokens": 763139611.0, + "step": 19999 + }, + { + "epoch": 2.544205571810202, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7475273609161377, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8666180968284607, + "num_tokens": 763181085.0, + "step": 20000 + }, + { + "epoch": 2.5443327820887927, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0653927326202393, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.863095223903656, + "num_tokens": 763219811.0, + "step": 20001 + }, + { + "epoch": 2.544459992367383, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9293997287750244, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8544199466705322, + "num_tokens": 763257745.0, + "step": 20002 + }, + { + "epoch": 2.5445872026459737, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9202579259872437, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8499637246131897, + "num_tokens": 763299475.0, + "step": 20003 + }, + { + "epoch": 2.5447144129245642, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3198938369750977, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8425732851028442, + "num_tokens": 763333960.0, + "step": 20004 + }, + { + "epoch": 2.5448416232031548, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7823030948638916, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8569089770317078, + "num_tokens": 763378646.0, + "step": 20005 + }, + { + "epoch": 2.5449688334817453, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9206218719482422, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8694320917129517, + "num_tokens": 763412441.0, + "step": 20006 + }, + { + "epoch": 2.545096043760336, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.946412205696106, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8688389658927917, + "num_tokens": 763446713.0, + "step": 20007 + }, + { + "epoch": 2.5452232540389264, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8970972299575806, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8751341700553894, + "num_tokens": 763478686.0, + "step": 20008 + }, + { + "epoch": 2.545350464317517, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8309082984924316, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.880563497543335, + "num_tokens": 763517258.0, + "step": 20009 + }, + { + "epoch": 2.5454776745961074, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9415932893753052, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8596676588058472, + "num_tokens": 763553229.0, + "step": 20010 + }, + { + "epoch": 2.545604884874698, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8626645803451538, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8732476830482483, + "num_tokens": 763594274.0, + "step": 20011 + }, + { + "epoch": 2.5457320951532885, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.822743535041809, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.857105016708374, + "num_tokens": 763632162.0, + "step": 20012 + }, + { + "epoch": 2.545859305431879, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.090634346008301, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8669648170471191, + "num_tokens": 763666797.0, + "step": 20013 + }, + { + "epoch": 2.5459865157104695, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9606616497039795, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8611749410629272, + "num_tokens": 763705475.0, + "step": 20014 + }, + { + "epoch": 2.54611372598906, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.929072380065918, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8800885677337646, + "num_tokens": 763739434.0, + "step": 20015 + }, + { + "epoch": 2.5462409362676506, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.079505443572998, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8514472246170044, + "num_tokens": 763773262.0, + "step": 20016 + }, + { + "epoch": 2.546368146546241, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9202710390090942, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8541805148124695, + "num_tokens": 763812573.0, + "step": 20017 + }, + { + "epoch": 2.546495356824831, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8774163722991943, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8670490980148315, + "num_tokens": 763848555.0, + "step": 20018 + }, + { + "epoch": 2.546622567103422, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0615315437316895, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8623843193054199, + "num_tokens": 763884608.0, + "step": 20019 + }, + { + "epoch": 2.5467497773820122, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9594993591308594, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8599039316177368, + "num_tokens": 763922038.0, + "step": 20020 + }, + { + "epoch": 2.546876987660603, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9740058183670044, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8614003658294678, + "num_tokens": 763958089.0, + "step": 20021 + }, + { + "epoch": 2.5470041979391933, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7686482667922974, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8657864332199097, + "num_tokens": 764000802.0, + "step": 20022 + }, + { + "epoch": 2.547131408217784, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9859284162521362, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8696403503417969, + "num_tokens": 764036628.0, + "step": 20023 + }, + { + "epoch": 2.5472586184963744, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9810913801193237, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8672658205032349, + "num_tokens": 764073643.0, + "step": 20024 + }, + { + "epoch": 2.547385828774965, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8783960342407227, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8709688186645508, + "num_tokens": 764110473.0, + "step": 20025 + }, + { + "epoch": 2.5475130390535554, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0016679763793945, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.862553060054779, + "num_tokens": 764142941.0, + "step": 20026 + }, + { + "epoch": 2.547640249332146, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.07903790473938, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8674935698509216, + "num_tokens": 764178752.0, + "step": 20027 + }, + { + "epoch": 2.5477674596107365, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0634243488311768, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8625823259353638, + "num_tokens": 764217440.0, + "step": 20028 + }, + { + "epoch": 2.547894669889327, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.056339740753174, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8540060520172119, + "num_tokens": 764257524.0, + "step": 20029 + }, + { + "epoch": 2.5480218801679175, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0215752124786377, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8586961627006531, + "num_tokens": 764293645.0, + "step": 20030 + }, + { + "epoch": 2.548149090446508, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0183210372924805, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8721877932548523, + "num_tokens": 764331660.0, + "step": 20031 + }, + { + "epoch": 2.5482763007250986, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.645370364189148, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8835793733596802, + "num_tokens": 764376577.0, + "step": 20032 + }, + { + "epoch": 2.548403511003689, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9311883449554443, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8630483150482178, + "num_tokens": 764415492.0, + "step": 20033 + }, + { + "epoch": 2.5485307212822796, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.842248558998108, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.855485200881958, + "num_tokens": 764453948.0, + "step": 20034 + }, + { + "epoch": 2.54865793156087, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 7.714356899261475, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8687474727630615, + "num_tokens": 764493906.0, + "step": 20035 + }, + { + "epoch": 2.5487851418394607, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0648033618927, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8586859703063965, + "num_tokens": 764532379.0, + "step": 20036 + }, + { + "epoch": 2.5489123521180512, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.921185255050659, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8460001945495605, + "num_tokens": 764567726.0, + "step": 20037 + }, + { + "epoch": 2.5490395623966418, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1390819549560547, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8641875386238098, + "num_tokens": 764606229.0, + "step": 20038 + }, + { + "epoch": 2.5491667726752323, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0358726978302, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8766476511955261, + "num_tokens": 764641346.0, + "step": 20039 + }, + { + "epoch": 2.549293982953823, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7770224809646606, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8649545311927795, + "num_tokens": 764687286.0, + "step": 20040 + }, + { + "epoch": 2.5494211932324133, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7422608137130737, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8690001964569092, + "num_tokens": 764729235.0, + "step": 20041 + }, + { + "epoch": 2.549548403511004, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7210946083068848, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8577347993850708, + "num_tokens": 764771834.0, + "step": 20042 + }, + { + "epoch": 2.549675613789594, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8151590824127197, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8643449544906616, + "num_tokens": 764808307.0, + "step": 20043 + }, + { + "epoch": 2.549802824068185, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7926586866378784, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8585001230239868, + "num_tokens": 764848373.0, + "step": 20044 + }, + { + "epoch": 2.549930034346775, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9317290782928467, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8606778383255005, + "num_tokens": 764886468.0, + "step": 20045 + }, + { + "epoch": 2.550057244625366, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.784018635749817, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8682378530502319, + "num_tokens": 764928690.0, + "step": 20046 + }, + { + "epoch": 2.550184454903956, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8845112323760986, + "learning_rate": 1e-06, + "loss": 0.5482, + "mean_token_accuracy": 0.8307726383209229, + "num_tokens": 764969148.0, + "step": 20047 + }, + { + "epoch": 2.5503116651825466, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 7.786326885223389, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8591251373291016, + "num_tokens": 765010754.0, + "step": 20048 + }, + { + "epoch": 2.550438875461137, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0305733680725098, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.868336021900177, + "num_tokens": 765043562.0, + "step": 20049 + }, + { + "epoch": 2.5505660857397277, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.989655613899231, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8449375033378601, + "num_tokens": 765085638.0, + "step": 20050 + }, + { + "epoch": 2.550693296018318, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.004281520843506, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8702934980392456, + "num_tokens": 765115847.0, + "step": 20051 + }, + { + "epoch": 2.5508205062969087, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.022862434387207, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8646166324615479, + "num_tokens": 765146839.0, + "step": 20052 + }, + { + "epoch": 2.5509477165754992, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0110127925872803, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8651868104934692, + "num_tokens": 765185921.0, + "step": 20053 + }, + { + "epoch": 2.5510749268540898, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.364043712615967, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8631800413131714, + "num_tokens": 765224534.0, + "step": 20054 + }, + { + "epoch": 2.5512021371326803, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7263102531433105, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8737576603889465, + "num_tokens": 765267160.0, + "step": 20055 + }, + { + "epoch": 2.551329347411271, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8667131662368774, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8658215403556824, + "num_tokens": 765304770.0, + "step": 20056 + }, + { + "epoch": 2.5514565576898613, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9001878499984741, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.860236406326294, + "num_tokens": 765341966.0, + "step": 20057 + }, + { + "epoch": 2.551583767968452, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8550254106521606, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8804173469543457, + "num_tokens": 765382653.0, + "step": 20058 + }, + { + "epoch": 2.5517109782470424, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9477229118347168, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8635439872741699, + "num_tokens": 765414940.0, + "step": 20059 + }, + { + "epoch": 2.551838188525633, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.933829665184021, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8663667440414429, + "num_tokens": 765445549.0, + "step": 20060 + }, + { + "epoch": 2.5519653988042235, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8086053133010864, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8719466924667358, + "num_tokens": 765484474.0, + "step": 20061 + }, + { + "epoch": 2.552092609082814, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8957477807998657, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8642479777336121, + "num_tokens": 765522651.0, + "step": 20062 + }, + { + "epoch": 2.5522198193614045, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8042757511138916, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8602383136749268, + "num_tokens": 765565983.0, + "step": 20063 + }, + { + "epoch": 2.552347029639995, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9424813985824585, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8661103248596191, + "num_tokens": 765604056.0, + "step": 20064 + }, + { + "epoch": 2.5524742399185856, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7647618055343628, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8731780648231506, + "num_tokens": 765646062.0, + "step": 20065 + }, + { + "epoch": 2.5526014501971757, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8474504947662354, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8632253408432007, + "num_tokens": 765687284.0, + "step": 20066 + }, + { + "epoch": 2.5527286604757666, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0054054260253906, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8530988693237305, + "num_tokens": 765723743.0, + "step": 20067 + }, + { + "epoch": 2.5528558707543567, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7745267152786255, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8629039525985718, + "num_tokens": 765768472.0, + "step": 20068 + }, + { + "epoch": 2.5529830810329477, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7818288803100586, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8809309005737305, + "num_tokens": 765809040.0, + "step": 20069 + }, + { + "epoch": 2.5531102913115378, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0285165309906006, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8659185767173767, + "num_tokens": 765845968.0, + "step": 20070 + }, + { + "epoch": 2.5532375015901287, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8883280754089355, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8704650402069092, + "num_tokens": 765881037.0, + "step": 20071 + }, + { + "epoch": 2.553364711868719, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8819407224655151, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.869126558303833, + "num_tokens": 765918945.0, + "step": 20072 + }, + { + "epoch": 2.5534919221473094, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8304394483566284, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8777889609336853, + "num_tokens": 765956128.0, + "step": 20073 + }, + { + "epoch": 2.5536191324259, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.841490387916565, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8778316974639893, + "num_tokens": 765997109.0, + "step": 20074 + }, + { + "epoch": 2.5537463427044904, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9532740116119385, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8611987829208374, + "num_tokens": 766032557.0, + "step": 20075 + }, + { + "epoch": 2.553873552983081, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.870182991027832, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8626422882080078, + "num_tokens": 766075285.0, + "step": 20076 + }, + { + "epoch": 2.5540007632616715, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.003803253173828, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.862397313117981, + "num_tokens": 766110600.0, + "step": 20077 + }, + { + "epoch": 2.554127973540262, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9886900186538696, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8525506854057312, + "num_tokens": 766146877.0, + "step": 20078 + }, + { + "epoch": 2.5542551838188525, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8312137126922607, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8616564273834229, + "num_tokens": 766185164.0, + "step": 20079 + }, + { + "epoch": 2.554382394097443, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.973612904548645, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8851185441017151, + "num_tokens": 766221442.0, + "step": 20080 + }, + { + "epoch": 2.5545096043760336, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0063743591308594, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8414421081542969, + "num_tokens": 766255258.0, + "step": 20081 + }, + { + "epoch": 2.554636814654624, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9191062450408936, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8627212047576904, + "num_tokens": 766292392.0, + "step": 20082 + }, + { + "epoch": 2.5547640249332146, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8018782138824463, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8551335334777832, + "num_tokens": 766335312.0, + "step": 20083 + }, + { + "epoch": 2.554891235211805, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9459381103515625, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8668938875198364, + "num_tokens": 766372189.0, + "step": 20084 + }, + { + "epoch": 2.5550184454903957, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.923764944076538, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8545737266540527, + "num_tokens": 766414021.0, + "step": 20085 + }, + { + "epoch": 2.5551456557689862, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2363781929016113, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8792978525161743, + "num_tokens": 766452005.0, + "step": 20086 + }, + { + "epoch": 2.5552728660475768, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8838248252868652, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8580271601676941, + "num_tokens": 766492150.0, + "step": 20087 + }, + { + "epoch": 2.5554000763261673, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8385332822799683, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8755409717559814, + "num_tokens": 766533889.0, + "step": 20088 + }, + { + "epoch": 2.555527286604758, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0466244220733643, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8660339117050171, + "num_tokens": 766565008.0, + "step": 20089 + }, + { + "epoch": 2.5556544968833483, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9653499126434326, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8647398948669434, + "num_tokens": 766605744.0, + "step": 20090 + }, + { + "epoch": 2.5557817071619384, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.012295722961426, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8731911182403564, + "num_tokens": 766645782.0, + "step": 20091 + }, + { + "epoch": 2.5559089174405294, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.019878387451172, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8544527292251587, + "num_tokens": 766683135.0, + "step": 20092 + }, + { + "epoch": 2.5560361277191195, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.934088110923767, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8660095930099487, + "num_tokens": 766720691.0, + "step": 20093 + }, + { + "epoch": 2.5561633379977104, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0662996768951416, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.860393762588501, + "num_tokens": 766755426.0, + "step": 20094 + }, + { + "epoch": 2.5562905482763005, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0753583908081055, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8705859780311584, + "num_tokens": 766788315.0, + "step": 20095 + }, + { + "epoch": 2.5564177585548915, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7061028480529785, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8805782794952393, + "num_tokens": 766825664.0, + "step": 20096 + }, + { + "epoch": 2.5565449688334816, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2565746307373047, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8765604496002197, + "num_tokens": 766868145.0, + "step": 20097 + }, + { + "epoch": 2.556672179112072, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.418825626373291, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8532379865646362, + "num_tokens": 766907278.0, + "step": 20098 + }, + { + "epoch": 2.5567993893906626, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1788341999053955, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.877169132232666, + "num_tokens": 766941662.0, + "step": 20099 + }, + { + "epoch": 2.556926599669253, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.279670476913452, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8468188047409058, + "num_tokens": 766977725.0, + "step": 20100 + }, + { + "epoch": 2.5570538099478437, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9247078895568848, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8593529462814331, + "num_tokens": 767011011.0, + "step": 20101 + }, + { + "epoch": 2.5571810202264342, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8781956434249878, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8635051250457764, + "num_tokens": 767047078.0, + "step": 20102 + }, + { + "epoch": 2.5573082305050248, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7558772563934326, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8608808517456055, + "num_tokens": 767089208.0, + "step": 20103 + }, + { + "epoch": 2.5574354407836153, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8494449853897095, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8497800827026367, + "num_tokens": 767131225.0, + "step": 20104 + }, + { + "epoch": 2.557562651062206, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9806230068206787, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8523036241531372, + "num_tokens": 767165222.0, + "step": 20105 + }, + { + "epoch": 2.5576898613407963, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.927986979484558, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8651028871536255, + "num_tokens": 767200870.0, + "step": 20106 + }, + { + "epoch": 2.557817071619387, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.988816738128662, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8628596663475037, + "num_tokens": 767236881.0, + "step": 20107 + }, + { + "epoch": 2.5579442818979774, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9541782140731812, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8690177202224731, + "num_tokens": 767269716.0, + "step": 20108 + }, + { + "epoch": 2.558071492176568, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.958709478378296, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8552712202072144, + "num_tokens": 767306840.0, + "step": 20109 + }, + { + "epoch": 2.5581987024551585, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0238285064697266, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8769710063934326, + "num_tokens": 767346966.0, + "step": 20110 + }, + { + "epoch": 2.558325912733749, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9722404479980469, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8621478080749512, + "num_tokens": 767387084.0, + "step": 20111 + }, + { + "epoch": 2.5584531230123395, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.272336721420288, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.844558596611023, + "num_tokens": 767417857.0, + "step": 20112 + }, + { + "epoch": 2.55858033329093, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.969292402267456, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8668282628059387, + "num_tokens": 767455771.0, + "step": 20113 + }, + { + "epoch": 2.5587075435695206, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8110171556472778, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8528264164924622, + "num_tokens": 767493187.0, + "step": 20114 + }, + { + "epoch": 2.558834753848111, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.823245882987976, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.857540488243103, + "num_tokens": 767534526.0, + "step": 20115 + }, + { + "epoch": 2.558961964126701, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0397114753723145, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8620231747627258, + "num_tokens": 767569297.0, + "step": 20116 + }, + { + "epoch": 2.559089174405292, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.002866506576538, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.861033022403717, + "num_tokens": 767605860.0, + "step": 20117 + }, + { + "epoch": 2.5592163846838822, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.81498122215271, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8670908212661743, + "num_tokens": 767645252.0, + "step": 20118 + }, + { + "epoch": 2.559343594962473, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.141637086868286, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8645176887512207, + "num_tokens": 767681942.0, + "step": 20119 + }, + { + "epoch": 2.5594708052410633, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9449504613876343, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8650825023651123, + "num_tokens": 767720631.0, + "step": 20120 + }, + { + "epoch": 2.559598015519654, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8915252685546875, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8689956665039062, + "num_tokens": 767757018.0, + "step": 20121 + }, + { + "epoch": 2.5597252257982444, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1173460483551025, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8685998320579529, + "num_tokens": 767796046.0, + "step": 20122 + }, + { + "epoch": 2.559852436076835, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8152016401290894, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8761990666389465, + "num_tokens": 767835678.0, + "step": 20123 + }, + { + "epoch": 2.5599796463554254, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.33880352973938, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8790918588638306, + "num_tokens": 767870515.0, + "step": 20124 + }, + { + "epoch": 2.560106856634016, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.928182601928711, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8493047952651978, + "num_tokens": 767911573.0, + "step": 20125 + }, + { + "epoch": 2.5602340669126065, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8725634813308716, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8625931739807129, + "num_tokens": 767950161.0, + "step": 20126 + }, + { + "epoch": 2.560361277191197, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8510448932647705, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8614361882209778, + "num_tokens": 767989431.0, + "step": 20127 + }, + { + "epoch": 2.5604884874697875, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0400543212890625, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8747401833534241, + "num_tokens": 768023551.0, + "step": 20128 + }, + { + "epoch": 2.560615697748378, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9733881950378418, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8728715777397156, + "num_tokens": 768055317.0, + "step": 20129 + }, + { + "epoch": 2.5607429080269686, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0311200618743896, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8563629984855652, + "num_tokens": 768089575.0, + "step": 20130 + }, + { + "epoch": 2.560870118305559, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.065967082977295, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8549147844314575, + "num_tokens": 768131795.0, + "step": 20131 + }, + { + "epoch": 2.5609973285841496, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9020010232925415, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8695726990699768, + "num_tokens": 768165714.0, + "step": 20132 + }, + { + "epoch": 2.56112453886274, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8221073150634766, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8739169836044312, + "num_tokens": 768203475.0, + "step": 20133 + }, + { + "epoch": 2.5612517491413307, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7913657426834106, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8810114860534668, + "num_tokens": 768244841.0, + "step": 20134 + }, + { + "epoch": 2.561378959419921, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8040353059768677, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8611009120941162, + "num_tokens": 768286502.0, + "step": 20135 + }, + { + "epoch": 2.5615061696985117, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.899395227432251, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8726906776428223, + "num_tokens": 768320628.0, + "step": 20136 + }, + { + "epoch": 2.5616333799771023, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.067669630050659, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8577742576599121, + "num_tokens": 768353630.0, + "step": 20137 + }, + { + "epoch": 2.561760590255693, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8815686702728271, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8639578223228455, + "num_tokens": 768388170.0, + "step": 20138 + }, + { + "epoch": 2.5618878005342833, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8281700611114502, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8817691802978516, + "num_tokens": 768428238.0, + "step": 20139 + }, + { + "epoch": 2.562015010812874, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2055206298828125, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8789764642715454, + "num_tokens": 768460075.0, + "step": 20140 + }, + { + "epoch": 2.562142221091464, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.056000232696533, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8826556205749512, + "num_tokens": 768493982.0, + "step": 20141 + }, + { + "epoch": 2.562269431370055, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.107448101043701, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8684660196304321, + "num_tokens": 768529121.0, + "step": 20142 + }, + { + "epoch": 2.562396641648645, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.154188632965088, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.865264892578125, + "num_tokens": 768560730.0, + "step": 20143 + }, + { + "epoch": 2.562523851927236, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7286783456802368, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8770005702972412, + "num_tokens": 768603015.0, + "step": 20144 + }, + { + "epoch": 2.562651062205826, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9395129680633545, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8626739382743835, + "num_tokens": 768639847.0, + "step": 20145 + }, + { + "epoch": 2.5627782724844166, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1209256649017334, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8623054027557373, + "num_tokens": 768681614.0, + "step": 20146 + }, + { + "epoch": 2.562905482763007, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.844226360321045, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8673853278160095, + "num_tokens": 768722052.0, + "step": 20147 + }, + { + "epoch": 2.5630326930415976, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.045661211013794, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8565424084663391, + "num_tokens": 768754191.0, + "step": 20148 + }, + { + "epoch": 2.563159903320188, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8104970455169678, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8544677495956421, + "num_tokens": 768792981.0, + "step": 20149 + }, + { + "epoch": 2.5632871135987787, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.930493950843811, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8550631403923035, + "num_tokens": 768827040.0, + "step": 20150 + }, + { + "epoch": 2.5634143238773692, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.942181944847107, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8638852834701538, + "num_tokens": 768865669.0, + "step": 20151 + }, + { + "epoch": 2.5635415341559598, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9261500835418701, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8637391924858093, + "num_tokens": 768903306.0, + "step": 20152 + }, + { + "epoch": 2.5636687444345503, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8244976997375488, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8688144683837891, + "num_tokens": 768946322.0, + "step": 20153 + }, + { + "epoch": 2.563795954713141, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.6759164333343506, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.882804274559021, + "num_tokens": 768987350.0, + "step": 20154 + }, + { + "epoch": 2.5639231649917313, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.6800320148468018, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8752628564834595, + "num_tokens": 769030762.0, + "step": 20155 + }, + { + "epoch": 2.564050375270322, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8757836818695068, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8774842619895935, + "num_tokens": 769069827.0, + "step": 20156 + }, + { + "epoch": 2.5641775855489124, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.345836877822876, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8644089698791504, + "num_tokens": 769107742.0, + "step": 20157 + }, + { + "epoch": 2.564304795827503, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9142796993255615, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8625852465629578, + "num_tokens": 769153286.0, + "step": 20158 + }, + { + "epoch": 2.5644320061060935, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8382664918899536, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8674624562263489, + "num_tokens": 769192646.0, + "step": 20159 + }, + { + "epoch": 2.564559216384684, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9272518157958984, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8424656391143799, + "num_tokens": 769230360.0, + "step": 20160 + }, + { + "epoch": 2.5646864266632745, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9473384618759155, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8792951107025146, + "num_tokens": 769270754.0, + "step": 20161 + }, + { + "epoch": 2.564813636941865, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.129857063293457, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.841669499874115, + "num_tokens": 769310109.0, + "step": 20162 + }, + { + "epoch": 2.5649408472204556, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8204436302185059, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8635847568511963, + "num_tokens": 769352101.0, + "step": 20163 + }, + { + "epoch": 2.5650680574990457, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8630411624908447, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8800299763679504, + "num_tokens": 769386570.0, + "step": 20164 + }, + { + "epoch": 2.5651952677776366, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7056999206542969, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8777376413345337, + "num_tokens": 769427167.0, + "step": 20165 + }, + { + "epoch": 2.5653224780562267, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8730062246322632, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8536908626556396, + "num_tokens": 769465456.0, + "step": 20166 + }, + { + "epoch": 2.5654496883348177, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0624160766601562, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8784018754959106, + "num_tokens": 769497135.0, + "step": 20167 + }, + { + "epoch": 2.5655768986134078, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0191221237182617, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8596547842025757, + "num_tokens": 769533706.0, + "step": 20168 + }, + { + "epoch": 2.5657041088919987, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8563790321350098, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8599215149879456, + "num_tokens": 769578698.0, + "step": 20169 + }, + { + "epoch": 2.565831319170589, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8248167037963867, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8791031837463379, + "num_tokens": 769619799.0, + "step": 20170 + }, + { + "epoch": 2.5659585294491793, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9280604124069214, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8648627400398254, + "num_tokens": 769655973.0, + "step": 20171 + }, + { + "epoch": 2.56608573972777, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.026548385620117, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8604198694229126, + "num_tokens": 769692333.0, + "step": 20172 + }, + { + "epoch": 2.5662129500063604, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8608629703521729, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8693457841873169, + "num_tokens": 769731804.0, + "step": 20173 + }, + { + "epoch": 2.566340160284951, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8834744691848755, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8703912496566772, + "num_tokens": 769767409.0, + "step": 20174 + }, + { + "epoch": 2.5664673705635415, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8709713220596313, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8611964583396912, + "num_tokens": 769806468.0, + "step": 20175 + }, + { + "epoch": 2.566594580842132, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8439738750457764, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8722634315490723, + "num_tokens": 769842025.0, + "step": 20176 + }, + { + "epoch": 2.5667217911207225, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7836703062057495, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8600826263427734, + "num_tokens": 769884125.0, + "step": 20177 + }, + { + "epoch": 2.566849001399313, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9886759519577026, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.853131890296936, + "num_tokens": 769920523.0, + "step": 20178 + }, + { + "epoch": 2.5669762116779036, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0165419578552246, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8624318838119507, + "num_tokens": 769960141.0, + "step": 20179 + }, + { + "epoch": 2.567103421956494, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9946931600570679, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8675277233123779, + "num_tokens": 769995281.0, + "step": 20180 + }, + { + "epoch": 2.5672306322350846, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 3.937286376953125, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8684819340705872, + "num_tokens": 770028230.0, + "step": 20181 + }, + { + "epoch": 2.567357842513675, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.202979564666748, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8719876408576965, + "num_tokens": 770068996.0, + "step": 20182 + }, + { + "epoch": 2.5674850527922657, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.002490520477295, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8537523746490479, + "num_tokens": 770107018.0, + "step": 20183 + }, + { + "epoch": 2.567612263070856, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.032043218612671, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8646278977394104, + "num_tokens": 770140251.0, + "step": 20184 + }, + { + "epoch": 2.5677394733494467, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.798691987991333, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8567638397216797, + "num_tokens": 770182241.0, + "step": 20185 + }, + { + "epoch": 2.5678666836280373, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8958499431610107, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.879895031452179, + "num_tokens": 770223267.0, + "step": 20186 + }, + { + "epoch": 2.567993893906628, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.593942642211914, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8538029193878174, + "num_tokens": 770261856.0, + "step": 20187 + }, + { + "epoch": 2.5681211041852183, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9163074493408203, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8543514013290405, + "num_tokens": 770299762.0, + "step": 20188 + }, + { + "epoch": 2.5682483144638084, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.232175350189209, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8596179485321045, + "num_tokens": 770333690.0, + "step": 20189 + }, + { + "epoch": 2.5683755247423994, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8141608238220215, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8706625699996948, + "num_tokens": 770375755.0, + "step": 20190 + }, + { + "epoch": 2.5685027350209895, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8704420328140259, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8750389814376831, + "num_tokens": 770417805.0, + "step": 20191 + }, + { + "epoch": 2.5686299452995804, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.847550868988037, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.864726185798645, + "num_tokens": 770458183.0, + "step": 20192 + }, + { + "epoch": 2.5687571555781705, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3314993381500244, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8667908906936646, + "num_tokens": 770488973.0, + "step": 20193 + }, + { + "epoch": 2.5688843658567615, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0606119632720947, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8593570590019226, + "num_tokens": 770529030.0, + "step": 20194 + }, + { + "epoch": 2.5690115761353516, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3719429969787598, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8545793294906616, + "num_tokens": 770574697.0, + "step": 20195 + }, + { + "epoch": 2.569138786413942, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8266732692718506, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8728862404823303, + "num_tokens": 770615417.0, + "step": 20196 + }, + { + "epoch": 2.5692659966925326, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9634997844696045, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8506230115890503, + "num_tokens": 770653228.0, + "step": 20197 + }, + { + "epoch": 2.569393206971123, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8674266338348389, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8815550804138184, + "num_tokens": 770693876.0, + "step": 20198 + }, + { + "epoch": 2.5695204172497137, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8185616731643677, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.863847553730011, + "num_tokens": 770728754.0, + "step": 20199 + }, + { + "epoch": 2.5696476275283042, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9981294870376587, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8633606433868408, + "num_tokens": 770767961.0, + "step": 20200 + }, + { + "epoch": 2.5697748378068948, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9855964183807373, + "learning_rate": 1e-06, + "loss": 0.5301, + "mean_token_accuracy": 0.8379731178283691, + "num_tokens": 770808880.0, + "step": 20201 + }, + { + "epoch": 2.5699020480854853, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.954696536064148, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8600873947143555, + "num_tokens": 770846483.0, + "step": 20202 + }, + { + "epoch": 2.570029258364076, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8541743755340576, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8453776240348816, + "num_tokens": 770884234.0, + "step": 20203 + }, + { + "epoch": 2.5701564686426663, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0462968349456787, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8500473499298096, + "num_tokens": 770918785.0, + "step": 20204 + }, + { + "epoch": 2.570283678921257, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.315906286239624, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8869081735610962, + "num_tokens": 770956889.0, + "step": 20205 + }, + { + "epoch": 2.5704108891998474, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1846859455108643, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8533850908279419, + "num_tokens": 770994937.0, + "step": 20206 + }, + { + "epoch": 2.570538099478438, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9171019792556763, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8725747466087341, + "num_tokens": 771031451.0, + "step": 20207 + }, + { + "epoch": 2.5706653097570284, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0689289569854736, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8736328482627869, + "num_tokens": 771061575.0, + "step": 20208 + }, + { + "epoch": 2.570792520035619, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.758172869682312, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8654201030731201, + "num_tokens": 771103366.0, + "step": 20209 + }, + { + "epoch": 2.5709197303142095, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8027725219726562, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.870063066482544, + "num_tokens": 771142178.0, + "step": 20210 + }, + { + "epoch": 2.5710469405928, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8222308158874512, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8649958372116089, + "num_tokens": 771179296.0, + "step": 20211 + }, + { + "epoch": 2.5711741508713906, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8894013166427612, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8575531244277954, + "num_tokens": 771219084.0, + "step": 20212 + }, + { + "epoch": 2.571301361149981, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.028367757797241, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8716063499450684, + "num_tokens": 771250609.0, + "step": 20213 + }, + { + "epoch": 2.571428571428571, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.00081467628479, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8644378185272217, + "num_tokens": 771286191.0, + "step": 20214 + }, + { + "epoch": 2.571555781707162, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0789401531219482, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8630096912384033, + "num_tokens": 771320898.0, + "step": 20215 + }, + { + "epoch": 2.5716829919857522, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0983726978302, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8564757108688354, + "num_tokens": 771353977.0, + "step": 20216 + }, + { + "epoch": 2.571810202264343, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9622999429702759, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8711852431297302, + "num_tokens": 771391960.0, + "step": 20217 + }, + { + "epoch": 2.5719374125429333, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9400030374526978, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8564257621765137, + "num_tokens": 771427764.0, + "step": 20218 + }, + { + "epoch": 2.572064622821524, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7898881435394287, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8654782772064209, + "num_tokens": 771469706.0, + "step": 20219 + }, + { + "epoch": 2.5721918331001143, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.854017734527588, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8857589960098267, + "num_tokens": 771504296.0, + "step": 20220 + }, + { + "epoch": 2.572319043378705, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8647935390472412, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8645499348640442, + "num_tokens": 771544496.0, + "step": 20221 + }, + { + "epoch": 2.5724462536572954, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.887956976890564, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8738936185836792, + "num_tokens": 771584322.0, + "step": 20222 + }, + { + "epoch": 2.572573463935886, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7816975116729736, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8755866289138794, + "num_tokens": 771621424.0, + "step": 20223 + }, + { + "epoch": 2.5727006742144765, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9681661128997803, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8655722141265869, + "num_tokens": 771658173.0, + "step": 20224 + }, + { + "epoch": 2.572827884493067, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0025196075439453, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8565930128097534, + "num_tokens": 771698182.0, + "step": 20225 + }, + { + "epoch": 2.5729550947716575, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1097371578216553, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.849997878074646, + "num_tokens": 771731559.0, + "step": 20226 + }, + { + "epoch": 2.573082305050248, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8789031505584717, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8620625734329224, + "num_tokens": 771774362.0, + "step": 20227 + }, + { + "epoch": 2.5732095153288386, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.147120714187622, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8542429208755493, + "num_tokens": 771810540.0, + "step": 20228 + }, + { + "epoch": 2.573336725607429, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9041614532470703, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8683497905731201, + "num_tokens": 771846221.0, + "step": 20229 + }, + { + "epoch": 2.5734639358860196, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8593891859054565, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8739012479782104, + "num_tokens": 771882268.0, + "step": 20230 + }, + { + "epoch": 2.57359114616461, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0630483627319336, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8628544211387634, + "num_tokens": 771913176.0, + "step": 20231 + }, + { + "epoch": 2.5737183564432007, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.935684084892273, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8595567941665649, + "num_tokens": 771948163.0, + "step": 20232 + }, + { + "epoch": 2.573845566721791, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8258005380630493, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8773618936538696, + "num_tokens": 771988296.0, + "step": 20233 + }, + { + "epoch": 2.5739727770003817, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.989220142364502, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8729301691055298, + "num_tokens": 772024330.0, + "step": 20234 + }, + { + "epoch": 2.5740999872789723, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0916085243225098, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.853033721446991, + "num_tokens": 772062291.0, + "step": 20235 + }, + { + "epoch": 2.574227197557563, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7198894023895264, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8659566640853882, + "num_tokens": 772110239.0, + "step": 20236 + }, + { + "epoch": 2.5743544078361533, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.972205400466919, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8610509634017944, + "num_tokens": 772153201.0, + "step": 20237 + }, + { + "epoch": 2.574481618114744, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9864782094955444, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8667052984237671, + "num_tokens": 772186816.0, + "step": 20238 + }, + { + "epoch": 2.574608828393334, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9104104042053223, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.875183641910553, + "num_tokens": 772224238.0, + "step": 20239 + }, + { + "epoch": 2.574736038671925, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.814633846282959, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8752813339233398, + "num_tokens": 772263505.0, + "step": 20240 + }, + { + "epoch": 2.574863248950515, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8387731313705444, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8734390735626221, + "num_tokens": 772303205.0, + "step": 20241 + }, + { + "epoch": 2.574990459229106, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0162270069122314, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.871111273765564, + "num_tokens": 772344495.0, + "step": 20242 + }, + { + "epoch": 2.575117669507696, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0553762912750244, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8602895736694336, + "num_tokens": 772380296.0, + "step": 20243 + }, + { + "epoch": 2.5752448797862866, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.065800666809082, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8795223236083984, + "num_tokens": 772423933.0, + "step": 20244 + }, + { + "epoch": 2.575372090064877, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9205586910247803, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.850095272064209, + "num_tokens": 772464514.0, + "step": 20245 + }, + { + "epoch": 2.5754993003434676, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7842463254928589, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8836669325828552, + "num_tokens": 772498501.0, + "step": 20246 + }, + { + "epoch": 2.575626510622058, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8593941926956177, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8603928089141846, + "num_tokens": 772538072.0, + "step": 20247 + }, + { + "epoch": 2.5757537209006487, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8605676889419556, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8651045560836792, + "num_tokens": 772577434.0, + "step": 20248 + }, + { + "epoch": 2.575880931179239, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 5.063357830047607, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8802789449691772, + "num_tokens": 772612917.0, + "step": 20249 + }, + { + "epoch": 2.5760081414578297, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9523968696594238, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8765654563903809, + "num_tokens": 772652939.0, + "step": 20250 + }, + { + "epoch": 2.5761353517364203, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.1059300899505615, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8727549910545349, + "num_tokens": 772688018.0, + "step": 20251 + }, + { + "epoch": 2.576262562015011, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.927648663520813, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8549520969390869, + "num_tokens": 772728071.0, + "step": 20252 + }, + { + "epoch": 2.5763897722936013, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.010927438735962, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8655388355255127, + "num_tokens": 772762730.0, + "step": 20253 + }, + { + "epoch": 2.576516982572192, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7996071577072144, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8687539100646973, + "num_tokens": 772799125.0, + "step": 20254 + }, + { + "epoch": 2.5766441928507824, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9047657251358032, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8774610161781311, + "num_tokens": 772838657.0, + "step": 20255 + }, + { + "epoch": 2.576771403129373, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9072057008743286, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8714189529418945, + "num_tokens": 772876470.0, + "step": 20256 + }, + { + "epoch": 2.5768986134079634, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.931693196296692, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8591598272323608, + "num_tokens": 772910369.0, + "step": 20257 + }, + { + "epoch": 2.577025823686554, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9958233833312988, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8720117807388306, + "num_tokens": 772948129.0, + "step": 20258 + }, + { + "epoch": 2.5771530339651445, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8773484230041504, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8916746377944946, + "num_tokens": 772985359.0, + "step": 20259 + }, + { + "epoch": 2.577280244243735, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9905972480773926, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8667243719100952, + "num_tokens": 773019402.0, + "step": 20260 + }, + { + "epoch": 2.5774074545223256, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9653950929641724, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8655037879943848, + "num_tokens": 773059176.0, + "step": 20261 + }, + { + "epoch": 2.5775346648009156, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8891593217849731, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8671665191650391, + "num_tokens": 773094274.0, + "step": 20262 + }, + { + "epoch": 2.5776618750795066, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7700036764144897, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8645612001419067, + "num_tokens": 773135679.0, + "step": 20263 + }, + { + "epoch": 2.5777890853580967, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9601465463638306, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8610758185386658, + "num_tokens": 773174270.0, + "step": 20264 + }, + { + "epoch": 2.5779162956366877, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8336299657821655, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8685817718505859, + "num_tokens": 773213582.0, + "step": 20265 + }, + { + "epoch": 2.5780435059152778, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.380361318588257, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.861854076385498, + "num_tokens": 773244980.0, + "step": 20266 + }, + { + "epoch": 2.5781707161938687, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9677284955978394, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8795187473297119, + "num_tokens": 773287395.0, + "step": 20267 + }, + { + "epoch": 2.578297926472459, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9447883367538452, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8625727891921997, + "num_tokens": 773324593.0, + "step": 20268 + }, + { + "epoch": 2.5784251367510493, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0728261470794678, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8596150875091553, + "num_tokens": 773361802.0, + "step": 20269 + }, + { + "epoch": 2.57855234702964, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7711931467056274, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8702192902565002, + "num_tokens": 773401348.0, + "step": 20270 + }, + { + "epoch": 2.5786795573082304, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0033481121063232, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8825997710227966, + "num_tokens": 773439040.0, + "step": 20271 + }, + { + "epoch": 2.578806767586821, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0533323287963867, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8780614137649536, + "num_tokens": 773471382.0, + "step": 20272 + }, + { + "epoch": 2.5789339778654115, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9247158765792847, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8538639545440674, + "num_tokens": 773508924.0, + "step": 20273 + }, + { + "epoch": 2.579061188144002, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.791758418083191, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8796877264976501, + "num_tokens": 773550219.0, + "step": 20274 + }, + { + "epoch": 2.5791883984225925, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.863126516342163, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8605009317398071, + "num_tokens": 773589363.0, + "step": 20275 + }, + { + "epoch": 2.579315608701183, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1115663051605225, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8653597831726074, + "num_tokens": 773630289.0, + "step": 20276 + }, + { + "epoch": 2.5794428189797736, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.875578761100769, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8661270141601562, + "num_tokens": 773668660.0, + "step": 20277 + }, + { + "epoch": 2.579570029258364, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8326317071914673, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8620383739471436, + "num_tokens": 773704318.0, + "step": 20278 + }, + { + "epoch": 2.5796972395369546, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9436181783676147, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8504096269607544, + "num_tokens": 773745821.0, + "step": 20279 + }, + { + "epoch": 2.579824449815545, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9717823266983032, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8758171200752258, + "num_tokens": 773782880.0, + "step": 20280 + }, + { + "epoch": 2.5799516600941357, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0530800819396973, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8507333993911743, + "num_tokens": 773824880.0, + "step": 20281 + }, + { + "epoch": 2.580078870372726, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8918182849884033, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8683682084083557, + "num_tokens": 773863210.0, + "step": 20282 + }, + { + "epoch": 2.5802060806513167, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0021414756774902, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8728441596031189, + "num_tokens": 773903432.0, + "step": 20283 + }, + { + "epoch": 2.5803332909299073, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7973896265029907, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8740009069442749, + "num_tokens": 773942424.0, + "step": 20284 + }, + { + "epoch": 2.580460501208498, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9770795106887817, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8632610440254211, + "num_tokens": 773976598.0, + "step": 20285 + }, + { + "epoch": 2.5805877114870883, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8624989986419678, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8596014380455017, + "num_tokens": 774016696.0, + "step": 20286 + }, + { + "epoch": 2.5807149217656784, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7167677879333496, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8583393096923828, + "num_tokens": 774059046.0, + "step": 20287 + }, + { + "epoch": 2.5808421320442694, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8640453815460205, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8618411421775818, + "num_tokens": 774095322.0, + "step": 20288 + }, + { + "epoch": 2.5809693423228595, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.979909896850586, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8548715114593506, + "num_tokens": 774134506.0, + "step": 20289 + }, + { + "epoch": 2.5810965526014504, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9576747417449951, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8816211819648743, + "num_tokens": 774176997.0, + "step": 20290 + }, + { + "epoch": 2.5812237628800405, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9140340089797974, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8594105243682861, + "num_tokens": 774213900.0, + "step": 20291 + }, + { + "epoch": 2.5813509731586315, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8670862913131714, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8580044507980347, + "num_tokens": 774259687.0, + "step": 20292 + }, + { + "epoch": 2.5814781834372216, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8737074136734009, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8701828718185425, + "num_tokens": 774297200.0, + "step": 20293 + }, + { + "epoch": 2.581605393715812, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.714540719985962, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8790158629417419, + "num_tokens": 774336642.0, + "step": 20294 + }, + { + "epoch": 2.5817326039944026, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9128724336624146, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8499663472175598, + "num_tokens": 774376762.0, + "step": 20295 + }, + { + "epoch": 2.581859814272993, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7300301790237427, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8748063445091248, + "num_tokens": 774421360.0, + "step": 20296 + }, + { + "epoch": 2.5819870245515837, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.19899582862854, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8727381229400635, + "num_tokens": 774460850.0, + "step": 20297 + }, + { + "epoch": 2.582114234830174, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9475359916687012, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8641018867492676, + "num_tokens": 774497542.0, + "step": 20298 + }, + { + "epoch": 2.5822414451087647, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8466659784317017, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8561865091323853, + "num_tokens": 774538164.0, + "step": 20299 + }, + { + "epoch": 2.5823686553873553, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0646846294403076, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8608145713806152, + "num_tokens": 774576118.0, + "step": 20300 + }, + { + "epoch": 2.582495865665946, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8640837669372559, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8877679705619812, + "num_tokens": 774617506.0, + "step": 20301 + }, + { + "epoch": 2.5826230759445363, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0907950401306152, + "learning_rate": 1e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8419718146324158, + "num_tokens": 774653720.0, + "step": 20302 + }, + { + "epoch": 2.582750286223127, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.103156805038452, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8865576386451721, + "num_tokens": 774691056.0, + "step": 20303 + }, + { + "epoch": 2.5828774965017174, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.199465274810791, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8590283393859863, + "num_tokens": 774726514.0, + "step": 20304 + }, + { + "epoch": 2.583004706780308, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9849222898483276, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8555054664611816, + "num_tokens": 774760831.0, + "step": 20305 + }, + { + "epoch": 2.5831319170588984, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.830932378768921, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8767060041427612, + "num_tokens": 774799399.0, + "step": 20306 + }, + { + "epoch": 2.583259127337489, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8356643915176392, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.85889732837677, + "num_tokens": 774839910.0, + "step": 20307 + }, + { + "epoch": 2.5833863376160795, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.035922050476074, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8680278062820435, + "num_tokens": 774875694.0, + "step": 20308 + }, + { + "epoch": 2.58351354789467, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1519832611083984, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8686777353286743, + "num_tokens": 774908020.0, + "step": 20309 + }, + { + "epoch": 2.5836407581732606, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8841469287872314, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8559200763702393, + "num_tokens": 774946716.0, + "step": 20310 + }, + { + "epoch": 2.583767968451851, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9888063669204712, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8605770468711853, + "num_tokens": 774981732.0, + "step": 20311 + }, + { + "epoch": 2.583895178730441, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9228395223617554, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.870959460735321, + "num_tokens": 775015280.0, + "step": 20312 + }, + { + "epoch": 2.584022389009032, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7917577028274536, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8733653426170349, + "num_tokens": 775054518.0, + "step": 20313 + }, + { + "epoch": 2.5841495992876222, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7420817613601685, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8727923631668091, + "num_tokens": 775091548.0, + "step": 20314 + }, + { + "epoch": 2.584276809566213, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.055590867996216, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8533465266227722, + "num_tokens": 775130916.0, + "step": 20315 + }, + { + "epoch": 2.5844040198448033, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.12803316116333, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8778063654899597, + "num_tokens": 775165875.0, + "step": 20316 + }, + { + "epoch": 2.584531230123394, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1397931575775146, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8512980937957764, + "num_tokens": 775211463.0, + "step": 20317 + }, + { + "epoch": 2.5846584404019843, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9296993017196655, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8751775026321411, + "num_tokens": 775248338.0, + "step": 20318 + }, + { + "epoch": 2.584785650680575, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9096404314041138, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8538410663604736, + "num_tokens": 775285707.0, + "step": 20319 + }, + { + "epoch": 2.5849128609591654, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.855273962020874, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8735196590423584, + "num_tokens": 775322682.0, + "step": 20320 + }, + { + "epoch": 2.585040071237756, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7663236856460571, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.876965343952179, + "num_tokens": 775358274.0, + "step": 20321 + }, + { + "epoch": 2.5851672815163464, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9426844120025635, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.861144483089447, + "num_tokens": 775394472.0, + "step": 20322 + }, + { + "epoch": 2.585294491794937, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8369901180267334, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8762401938438416, + "num_tokens": 775433647.0, + "step": 20323 + }, + { + "epoch": 2.5854217020735275, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.013606309890747, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8554778099060059, + "num_tokens": 775469159.0, + "step": 20324 + }, + { + "epoch": 2.585548912352118, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1861250400543213, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8743380904197693, + "num_tokens": 775507329.0, + "step": 20325 + }, + { + "epoch": 2.5856761226307086, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.057328224182129, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8636643290519714, + "num_tokens": 775544638.0, + "step": 20326 + }, + { + "epoch": 2.585803332909299, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8955779075622559, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.867735743522644, + "num_tokens": 775584918.0, + "step": 20327 + }, + { + "epoch": 2.5859305431878896, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.785740852355957, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8624319434165955, + "num_tokens": 775624214.0, + "step": 20328 + }, + { + "epoch": 2.58605775346648, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8264774084091187, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8540854454040527, + "num_tokens": 775668372.0, + "step": 20329 + }, + { + "epoch": 2.5861849637450707, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 7.780463695526123, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.836388349533081, + "num_tokens": 775700578.0, + "step": 20330 + }, + { + "epoch": 2.586312174023661, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8709406852722168, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.872653067111969, + "num_tokens": 775746980.0, + "step": 20331 + }, + { + "epoch": 2.5864393843022517, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.104825019836426, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.859511137008667, + "num_tokens": 775780762.0, + "step": 20332 + }, + { + "epoch": 2.5865665945808423, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9812523126602173, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8709464073181152, + "num_tokens": 775815591.0, + "step": 20333 + }, + { + "epoch": 2.586693804859433, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.909635305404663, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8580489754676819, + "num_tokens": 775856414.0, + "step": 20334 + }, + { + "epoch": 2.5868210151380233, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8896546363830566, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8660456538200378, + "num_tokens": 775896939.0, + "step": 20335 + }, + { + "epoch": 2.586948225416614, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.740079641342163, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8594493865966797, + "num_tokens": 775941778.0, + "step": 20336 + }, + { + "epoch": 2.587075435695204, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.100562572479248, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8646295070648193, + "num_tokens": 775979346.0, + "step": 20337 + }, + { + "epoch": 2.587202645973795, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.237565517425537, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8643127083778381, + "num_tokens": 776013793.0, + "step": 20338 + }, + { + "epoch": 2.587329856252385, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.930171251296997, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8778078556060791, + "num_tokens": 776058513.0, + "step": 20339 + }, + { + "epoch": 2.587457066530976, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.831539273262024, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8661679029464722, + "num_tokens": 776102790.0, + "step": 20340 + }, + { + "epoch": 2.587584276809566, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.006208658218384, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8799059987068176, + "num_tokens": 776133898.0, + "step": 20341 + }, + { + "epoch": 2.5877114870881566, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.015475273132324, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8644828796386719, + "num_tokens": 776167590.0, + "step": 20342 + }, + { + "epoch": 2.587838697366747, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7985973358154297, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8604329824447632, + "num_tokens": 776207966.0, + "step": 20343 + }, + { + "epoch": 2.5879659076453376, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9215508699417114, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8656280636787415, + "num_tokens": 776245601.0, + "step": 20344 + }, + { + "epoch": 2.588093117923928, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8383910655975342, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8739542961120605, + "num_tokens": 776285377.0, + "step": 20345 + }, + { + "epoch": 2.5882203282025187, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9422727823257446, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8580983877182007, + "num_tokens": 776328116.0, + "step": 20346 + }, + { + "epoch": 2.588347538481109, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9589334726333618, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8496313095092773, + "num_tokens": 776369008.0, + "step": 20347 + }, + { + "epoch": 2.5884747487596997, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7374014854431152, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8563511371612549, + "num_tokens": 776410151.0, + "step": 20348 + }, + { + "epoch": 2.5886019590382903, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7780742645263672, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8899763822555542, + "num_tokens": 776445944.0, + "step": 20349 + }, + { + "epoch": 2.588729169316881, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.15395450592041, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8694127798080444, + "num_tokens": 776485388.0, + "step": 20350 + }, + { + "epoch": 2.5888563795954713, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8097422122955322, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8474060297012329, + "num_tokens": 776529135.0, + "step": 20351 + }, + { + "epoch": 2.588983589874062, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0142998695373535, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.858549952507019, + "num_tokens": 776564310.0, + "step": 20352 + }, + { + "epoch": 2.5891108001526524, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6906108856201172, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8800177574157715, + "num_tokens": 776609004.0, + "step": 20353 + }, + { + "epoch": 2.589238010431243, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7712323665618896, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8724061250686646, + "num_tokens": 776645796.0, + "step": 20354 + }, + { + "epoch": 2.5893652207098334, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 20.467374801635742, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8566029667854309, + "num_tokens": 776684248.0, + "step": 20355 + }, + { + "epoch": 2.589492430988424, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.229886770248413, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8690610527992249, + "num_tokens": 776722684.0, + "step": 20356 + }, + { + "epoch": 2.5896196412670145, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9264869689941406, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8703140020370483, + "num_tokens": 776764635.0, + "step": 20357 + }, + { + "epoch": 2.589746851545605, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9816315174102783, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8543373942375183, + "num_tokens": 776805683.0, + "step": 20358 + }, + { + "epoch": 2.5898740618241956, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9820936918258667, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.867842435836792, + "num_tokens": 776841229.0, + "step": 20359 + }, + { + "epoch": 2.5900012721027856, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8148787021636963, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8728235363960266, + "num_tokens": 776881319.0, + "step": 20360 + }, + { + "epoch": 2.5901284823813766, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9739323854446411, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8707582354545593, + "num_tokens": 776919436.0, + "step": 20361 + }, + { + "epoch": 2.5902556926599667, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.903907299041748, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8479243516921997, + "num_tokens": 776958644.0, + "step": 20362 + }, + { + "epoch": 2.5903829029385577, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.912136435508728, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8822132349014282, + "num_tokens": 776994509.0, + "step": 20363 + }, + { + "epoch": 2.5905101132171477, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7392997741699219, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8713653087615967, + "num_tokens": 777037557.0, + "step": 20364 + }, + { + "epoch": 2.5906373234957387, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8513119220733643, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8456282019615173, + "num_tokens": 777077237.0, + "step": 20365 + }, + { + "epoch": 2.590764533774329, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8400483131408691, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8731863498687744, + "num_tokens": 777120813.0, + "step": 20366 + }, + { + "epoch": 2.5908917440529193, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.026027202606201, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8524938821792603, + "num_tokens": 777155809.0, + "step": 20367 + }, + { + "epoch": 2.59101895433151, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7973289489746094, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8661768436431885, + "num_tokens": 777195868.0, + "step": 20368 + }, + { + "epoch": 2.5911461646101004, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8473224639892578, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.869102418422699, + "num_tokens": 777233406.0, + "step": 20369 + }, + { + "epoch": 2.591273374888691, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8001198768615723, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8664846420288086, + "num_tokens": 777272817.0, + "step": 20370 + }, + { + "epoch": 2.5914005851672814, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9253672361373901, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8605800271034241, + "num_tokens": 777312398.0, + "step": 20371 + }, + { + "epoch": 2.591527795445872, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9788240194320679, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8762276768684387, + "num_tokens": 777349375.0, + "step": 20372 + }, + { + "epoch": 2.5916550057244625, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7792001962661743, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8745911121368408, + "num_tokens": 777394806.0, + "step": 20373 + }, + { + "epoch": 2.591782216003053, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7766145467758179, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8707371950149536, + "num_tokens": 777434350.0, + "step": 20374 + }, + { + "epoch": 2.5919094262816436, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8996000289916992, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8405733108520508, + "num_tokens": 777469690.0, + "step": 20375 + }, + { + "epoch": 2.592036636560234, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.91695499420166, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8578248023986816, + "num_tokens": 777502095.0, + "step": 20376 + }, + { + "epoch": 2.5921638468388246, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0369439125061035, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8766891956329346, + "num_tokens": 777540387.0, + "step": 20377 + }, + { + "epoch": 2.592291057117415, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.80145263671875, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8719863891601562, + "num_tokens": 777581855.0, + "step": 20378 + }, + { + "epoch": 2.5924182673960057, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8772735595703125, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.865803599357605, + "num_tokens": 777619852.0, + "step": 20379 + }, + { + "epoch": 2.592545477674596, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8358747959136963, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8587280511856079, + "num_tokens": 777662772.0, + "step": 20380 + }, + { + "epoch": 2.5926726879531867, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.89335298538208, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8491426706314087, + "num_tokens": 777701171.0, + "step": 20381 + }, + { + "epoch": 2.5927998982317773, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9755851030349731, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8522214293479919, + "num_tokens": 777738245.0, + "step": 20382 + }, + { + "epoch": 2.592927108510368, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8197736740112305, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8670176267623901, + "num_tokens": 777775732.0, + "step": 20383 + }, + { + "epoch": 2.5930543187889583, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0884573459625244, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8659977316856384, + "num_tokens": 777811077.0, + "step": 20384 + }, + { + "epoch": 2.5931815290675484, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.779796838760376, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8641358613967896, + "num_tokens": 777854107.0, + "step": 20385 + }, + { + "epoch": 2.5933087393461394, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9119716882705688, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8658947944641113, + "num_tokens": 777891950.0, + "step": 20386 + }, + { + "epoch": 2.5934359496247295, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8564201593399048, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8833646774291992, + "num_tokens": 777932638.0, + "step": 20387 + }, + { + "epoch": 2.5935631599033204, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8120712041854858, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.864369809627533, + "num_tokens": 777977733.0, + "step": 20388 + }, + { + "epoch": 2.5936903701819105, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8031708002090454, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8708552122116089, + "num_tokens": 778020451.0, + "step": 20389 + }, + { + "epoch": 2.5938175804605015, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.903549075126648, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8712608814239502, + "num_tokens": 778059594.0, + "step": 20390 + }, + { + "epoch": 2.5939447907390916, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7269518375396729, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8708476424217224, + "num_tokens": 778099235.0, + "step": 20391 + }, + { + "epoch": 2.594072001017682, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9375813007354736, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8752418756484985, + "num_tokens": 778135430.0, + "step": 20392 + }, + { + "epoch": 2.5941992112962726, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8829375505447388, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8576332330703735, + "num_tokens": 778175111.0, + "step": 20393 + }, + { + "epoch": 2.594326421574863, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9111589193344116, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8569387197494507, + "num_tokens": 778211954.0, + "step": 20394 + }, + { + "epoch": 2.5944536318534537, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.05006742477417, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8511796593666077, + "num_tokens": 778246580.0, + "step": 20395 + }, + { + "epoch": 2.594580842132044, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8290488719940186, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.871833324432373, + "num_tokens": 778288659.0, + "step": 20396 + }, + { + "epoch": 2.5947080524106347, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9355260133743286, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8640847206115723, + "num_tokens": 778325365.0, + "step": 20397 + }, + { + "epoch": 2.5948352626892253, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.944722056388855, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8730359673500061, + "num_tokens": 778362982.0, + "step": 20398 + }, + { + "epoch": 2.594962472967816, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.04164981842041, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8507055044174194, + "num_tokens": 778394230.0, + "step": 20399 + }, + { + "epoch": 2.5950896832464063, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.010087728500366, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8549243211746216, + "num_tokens": 778434345.0, + "step": 20400 + }, + { + "epoch": 2.595216893524997, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9873331785202026, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8594990968704224, + "num_tokens": 778478303.0, + "step": 20401 + }, + { + "epoch": 2.5953441038035874, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.907960295677185, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8717655539512634, + "num_tokens": 778513915.0, + "step": 20402 + }, + { + "epoch": 2.595471314082178, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.784741759300232, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8793944716453552, + "num_tokens": 778554294.0, + "step": 20403 + }, + { + "epoch": 2.5955985243607684, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7530654668807983, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8635695576667786, + "num_tokens": 778592836.0, + "step": 20404 + }, + { + "epoch": 2.595725734639359, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9205104112625122, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8565005660057068, + "num_tokens": 778629316.0, + "step": 20405 + }, + { + "epoch": 2.5958529449179495, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.273519277572632, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8603005409240723, + "num_tokens": 778659065.0, + "step": 20406 + }, + { + "epoch": 2.59598015519654, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9455024003982544, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8807052373886108, + "num_tokens": 778693949.0, + "step": 20407 + }, + { + "epoch": 2.5961073654751305, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.803436040878296, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8701770305633545, + "num_tokens": 778738045.0, + "step": 20408 + }, + { + "epoch": 2.596234575753721, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3706448078155518, + "learning_rate": 1e-06, + "loss": 0.559, + "mean_token_accuracy": 0.8209048509597778, + "num_tokens": 778769578.0, + "step": 20409 + }, + { + "epoch": 2.596361786032311, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.875538945198059, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.861923336982727, + "num_tokens": 778810052.0, + "step": 20410 + }, + { + "epoch": 2.596488996310902, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8512998819351196, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8586215972900391, + "num_tokens": 778850868.0, + "step": 20411 + }, + { + "epoch": 2.596616206589492, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0220155715942383, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8726356029510498, + "num_tokens": 778890129.0, + "step": 20412 + }, + { + "epoch": 2.596743416868083, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9765132665634155, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8698906898498535, + "num_tokens": 778923181.0, + "step": 20413 + }, + { + "epoch": 2.5968706271466733, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8942748308181763, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8575236201286316, + "num_tokens": 778963328.0, + "step": 20414 + }, + { + "epoch": 2.596997837425264, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7754286527633667, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.876423716545105, + "num_tokens": 779003932.0, + "step": 20415 + }, + { + "epoch": 2.5971250477038543, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9279072284698486, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8558657169342041, + "num_tokens": 779043913.0, + "step": 20416 + }, + { + "epoch": 2.597252257982445, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9527637958526611, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8622231483459473, + "num_tokens": 779077126.0, + "step": 20417 + }, + { + "epoch": 2.5973794682610354, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8719784021377563, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.870614230632782, + "num_tokens": 779111544.0, + "step": 20418 + }, + { + "epoch": 2.597506678539626, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.971008539199829, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8657851815223694, + "num_tokens": 779145493.0, + "step": 20419 + }, + { + "epoch": 2.5976338888182164, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7790700197219849, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8577272891998291, + "num_tokens": 779189216.0, + "step": 20420 + }, + { + "epoch": 2.597761099096807, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8999444246292114, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8722404837608337, + "num_tokens": 779225914.0, + "step": 20421 + }, + { + "epoch": 2.5978883093753975, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0070948600769043, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8713972568511963, + "num_tokens": 779264118.0, + "step": 20422 + }, + { + "epoch": 2.598015519653988, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8622283935546875, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8688293099403381, + "num_tokens": 779301694.0, + "step": 20423 + }, + { + "epoch": 2.5981427299325786, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8726035356521606, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8636805415153503, + "num_tokens": 779339012.0, + "step": 20424 + }, + { + "epoch": 2.598269940211169, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8681095838546753, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8691533207893372, + "num_tokens": 779376358.0, + "step": 20425 + }, + { + "epoch": 2.5983971504897596, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.752192735671997, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8642135858535767, + "num_tokens": 779416344.0, + "step": 20426 + }, + { + "epoch": 2.59852436076835, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.127776622772217, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8528783321380615, + "num_tokens": 779452189.0, + "step": 20427 + }, + { + "epoch": 2.5986515710469407, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7527507543563843, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8646496534347534, + "num_tokens": 779493513.0, + "step": 20428 + }, + { + "epoch": 2.598778781325531, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0413095951080322, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8706427812576294, + "num_tokens": 779533061.0, + "step": 20429 + }, + { + "epoch": 2.5989059916041217, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.795846700668335, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8599506616592407, + "num_tokens": 779570159.0, + "step": 20430 + }, + { + "epoch": 2.5990332018827123, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8418275117874146, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8634522557258606, + "num_tokens": 779609066.0, + "step": 20431 + }, + { + "epoch": 2.599160412161303, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.78999924659729, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8658033609390259, + "num_tokens": 779651087.0, + "step": 20432 + }, + { + "epoch": 2.5992876224398933, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9919487237930298, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8781480193138123, + "num_tokens": 779689531.0, + "step": 20433 + }, + { + "epoch": 2.599414832718484, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.145707607269287, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8606892228126526, + "num_tokens": 779724062.0, + "step": 20434 + }, + { + "epoch": 2.599542042997074, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0216667652130127, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8517904877662659, + "num_tokens": 779766029.0, + "step": 20435 + }, + { + "epoch": 2.599669253275665, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9018973112106323, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.849390983581543, + "num_tokens": 779811065.0, + "step": 20436 + }, + { + "epoch": 2.599796463554255, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8679641485214233, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8516764044761658, + "num_tokens": 779852442.0, + "step": 20437 + }, + { + "epoch": 2.599923673832846, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.007443428039551, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8699713945388794, + "num_tokens": 779885993.0, + "step": 20438 + }, + { + "epoch": 2.600050884111436, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.048241138458252, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8685808777809143, + "num_tokens": 779927791.0, + "step": 20439 + }, + { + "epoch": 2.6001780943900266, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8761849403381348, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8679810762405396, + "num_tokens": 779961517.0, + "step": 20440 + }, + { + "epoch": 2.600305304668617, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1352527141571045, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8530421257019043, + "num_tokens": 779992902.0, + "step": 20441 + }, + { + "epoch": 2.6004325149472076, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9309473037719727, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8664992451667786, + "num_tokens": 780032849.0, + "step": 20442 + }, + { + "epoch": 2.600559725225798, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8531986474990845, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8771619200706482, + "num_tokens": 780072563.0, + "step": 20443 + }, + { + "epoch": 2.6006869355043887, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.795966625213623, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8651906847953796, + "num_tokens": 780115299.0, + "step": 20444 + }, + { + "epoch": 2.600814145782979, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7896391153335571, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8638744950294495, + "num_tokens": 780155792.0, + "step": 20445 + }, + { + "epoch": 2.6009413560615697, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9881641864776611, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8594892621040344, + "num_tokens": 780189040.0, + "step": 20446 + }, + { + "epoch": 2.6010685663401603, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9483163356781006, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8645206093788147, + "num_tokens": 780229463.0, + "step": 20447 + }, + { + "epoch": 2.601195776618751, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9131567478179932, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8766149282455444, + "num_tokens": 780265961.0, + "step": 20448 + }, + { + "epoch": 2.6013229868973413, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.921769380569458, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.859783411026001, + "num_tokens": 780303220.0, + "step": 20449 + }, + { + "epoch": 2.601450197175932, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9269267320632935, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8621302247047424, + "num_tokens": 780340456.0, + "step": 20450 + }, + { + "epoch": 2.6015774074545224, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9272159337997437, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8624784350395203, + "num_tokens": 780381402.0, + "step": 20451 + }, + { + "epoch": 2.601704617733113, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.790193796157837, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8692367672920227, + "num_tokens": 780422571.0, + "step": 20452 + }, + { + "epoch": 2.6018318280117034, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9816170930862427, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8607426881790161, + "num_tokens": 780462447.0, + "step": 20453 + }, + { + "epoch": 2.601959038290294, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8584175109863281, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8653823733329773, + "num_tokens": 780499183.0, + "step": 20454 + }, + { + "epoch": 2.6020862485688845, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8877639770507812, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.852293848991394, + "num_tokens": 780539216.0, + "step": 20455 + }, + { + "epoch": 2.602213458847475, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8761335611343384, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8726733326911926, + "num_tokens": 780578725.0, + "step": 20456 + }, + { + "epoch": 2.6023406691260655, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9699318408966064, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8659443259239197, + "num_tokens": 780615175.0, + "step": 20457 + }, + { + "epoch": 2.6024678794046556, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9498199224472046, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8679177761077881, + "num_tokens": 780651186.0, + "step": 20458 + }, + { + "epoch": 2.6025950896832466, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8782397508621216, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8589880466461182, + "num_tokens": 780685550.0, + "step": 20459 + }, + { + "epoch": 2.6027222999618367, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8583639860153198, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8865832090377808, + "num_tokens": 780725477.0, + "step": 20460 + }, + { + "epoch": 2.6028495102404277, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.935696005821228, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8688479065895081, + "num_tokens": 780762481.0, + "step": 20461 + }, + { + "epoch": 2.6029767205190177, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9366264343261719, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8457396626472473, + "num_tokens": 780800305.0, + "step": 20462 + }, + { + "epoch": 2.6031039307976087, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8727909326553345, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8673102855682373, + "num_tokens": 780836271.0, + "step": 20463 + }, + { + "epoch": 2.603231141076199, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7050178050994873, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8771963119506836, + "num_tokens": 780877363.0, + "step": 20464 + }, + { + "epoch": 2.6033583513547893, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7493746280670166, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8526220321655273, + "num_tokens": 780922982.0, + "step": 20465 + }, + { + "epoch": 2.60348556163338, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9250047206878662, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.861824631690979, + "num_tokens": 780957925.0, + "step": 20466 + }, + { + "epoch": 2.6036127719119704, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9063339233398438, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8470399975776672, + "num_tokens": 780996922.0, + "step": 20467 + }, + { + "epoch": 2.603739982190561, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0597610473632812, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8483865261077881, + "num_tokens": 781037209.0, + "step": 20468 + }, + { + "epoch": 2.6038671924691514, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8599088191986084, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8582718372344971, + "num_tokens": 781078077.0, + "step": 20469 + }, + { + "epoch": 2.603994402747742, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9378093481063843, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8641362190246582, + "num_tokens": 781122516.0, + "step": 20470 + }, + { + "epoch": 2.6041216130263325, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.36997652053833, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8580225706100464, + "num_tokens": 781156290.0, + "step": 20471 + }, + { + "epoch": 2.604248823304923, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8901702165603638, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8716427683830261, + "num_tokens": 781192769.0, + "step": 20472 + }, + { + "epoch": 2.6043760335835135, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0766117572784424, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8699410557746887, + "num_tokens": 781227594.0, + "step": 20473 + }, + { + "epoch": 2.604503243862104, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0868494510650635, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8761460781097412, + "num_tokens": 781259369.0, + "step": 20474 + }, + { + "epoch": 2.6046304541406946, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0384483337402344, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.863271951675415, + "num_tokens": 781296788.0, + "step": 20475 + }, + { + "epoch": 2.604757664419285, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8509413003921509, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8675662279129028, + "num_tokens": 781333103.0, + "step": 20476 + }, + { + "epoch": 2.6048848746978757, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6988766193389893, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8743475675582886, + "num_tokens": 781372691.0, + "step": 20477 + }, + { + "epoch": 2.605012084976466, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9234817028045654, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8608529567718506, + "num_tokens": 781410480.0, + "step": 20478 + }, + { + "epoch": 2.6051392952550567, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8838502168655396, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8566182851791382, + "num_tokens": 781454110.0, + "step": 20479 + }, + { + "epoch": 2.6052665055336472, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9795585870742798, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8629385828971863, + "num_tokens": 781494034.0, + "step": 20480 + }, + { + "epoch": 2.6053937158122378, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9457448720932007, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8624170422554016, + "num_tokens": 781536225.0, + "step": 20481 + }, + { + "epoch": 2.6055209260908283, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0380914211273193, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8704859018325806, + "num_tokens": 781573987.0, + "step": 20482 + }, + { + "epoch": 2.6056481363694184, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9456442594528198, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8687728047370911, + "num_tokens": 781608987.0, + "step": 20483 + }, + { + "epoch": 2.6057753466480094, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9026217460632324, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8713822364807129, + "num_tokens": 781646185.0, + "step": 20484 + }, + { + "epoch": 2.6059025569265994, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9604839086532593, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8534376621246338, + "num_tokens": 781682999.0, + "step": 20485 + }, + { + "epoch": 2.6060297672051904, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.84132981300354, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8624233603477478, + "num_tokens": 781726052.0, + "step": 20486 + }, + { + "epoch": 2.6061569774837805, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0782408714294434, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8781743049621582, + "num_tokens": 781760562.0, + "step": 20487 + }, + { + "epoch": 2.6062841877623715, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0812883377075195, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8527704477310181, + "num_tokens": 781803765.0, + "step": 20488 + }, + { + "epoch": 2.6064113980409616, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.018397808074951, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8649284839630127, + "num_tokens": 781845419.0, + "step": 20489 + }, + { + "epoch": 2.606538608319552, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9042689800262451, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.873706042766571, + "num_tokens": 781883371.0, + "step": 20490 + }, + { + "epoch": 2.6066658185981426, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.861862301826477, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.862944483757019, + "num_tokens": 781919669.0, + "step": 20491 + }, + { + "epoch": 2.606793028876733, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.954054594039917, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8707438707351685, + "num_tokens": 781953212.0, + "step": 20492 + }, + { + "epoch": 2.6069202391553237, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7554792165756226, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8684005737304688, + "num_tokens": 781996824.0, + "step": 20493 + }, + { + "epoch": 2.607047449433914, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8255847692489624, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8841268420219421, + "num_tokens": 782035350.0, + "step": 20494 + }, + { + "epoch": 2.6071746597125047, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9291635751724243, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.858598530292511, + "num_tokens": 782073217.0, + "step": 20495 + }, + { + "epoch": 2.6073018699910953, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.035330057144165, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.858128547668457, + "num_tokens": 782110777.0, + "step": 20496 + }, + { + "epoch": 2.607429080269686, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.785891056060791, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8589804768562317, + "num_tokens": 782153300.0, + "step": 20497 + }, + { + "epoch": 2.6075562905482763, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.853833794593811, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8696123361587524, + "num_tokens": 782191945.0, + "step": 20498 + }, + { + "epoch": 2.607683500826867, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3603808879852295, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.875887393951416, + "num_tokens": 782230472.0, + "step": 20499 + }, + { + "epoch": 2.6078107111054574, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3327362537384033, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.854952335357666, + "num_tokens": 782263868.0, + "step": 20500 + }, + { + "epoch": 2.607937921384048, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.870699644088745, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8674488067626953, + "num_tokens": 782299781.0, + "step": 20501 + }, + { + "epoch": 2.6080651316626384, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9904828071594238, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.857105016708374, + "num_tokens": 782343376.0, + "step": 20502 + }, + { + "epoch": 2.608192341941229, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7722735404968262, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8773373365402222, + "num_tokens": 782389894.0, + "step": 20503 + }, + { + "epoch": 2.6083195522198195, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8713690042495728, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8673123121261597, + "num_tokens": 782430055.0, + "step": 20504 + }, + { + "epoch": 2.60844676249841, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8516613245010376, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.866486132144928, + "num_tokens": 782467211.0, + "step": 20505 + }, + { + "epoch": 2.6085739727770005, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9440078735351562, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8580178022384644, + "num_tokens": 782505913.0, + "step": 20506 + }, + { + "epoch": 2.608701183055591, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9685115814208984, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8652563691139221, + "num_tokens": 782541858.0, + "step": 20507 + }, + { + "epoch": 2.608828393334181, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.584852457046509, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8637986183166504, + "num_tokens": 782575854.0, + "step": 20508 + }, + { + "epoch": 2.608955603612772, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7622432708740234, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.867723822593689, + "num_tokens": 782614005.0, + "step": 20509 + }, + { + "epoch": 2.609082813891362, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8809618949890137, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8735275268554688, + "num_tokens": 782650147.0, + "step": 20510 + }, + { + "epoch": 2.609210024169953, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.238327741622925, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8658156991004944, + "num_tokens": 782683215.0, + "step": 20511 + }, + { + "epoch": 2.6093372344485433, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9461448192596436, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8715847730636597, + "num_tokens": 782721184.0, + "step": 20512 + }, + { + "epoch": 2.609464444727134, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9653013944625854, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8601354360580444, + "num_tokens": 782759891.0, + "step": 20513 + }, + { + "epoch": 2.6095916550057243, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.604451894760132, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8553094863891602, + "num_tokens": 782796398.0, + "step": 20514 + }, + { + "epoch": 2.609718865284315, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9798210859298706, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8742325305938721, + "num_tokens": 782835968.0, + "step": 20515 + }, + { + "epoch": 2.6098460755629054, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.934372901916504, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8759291768074036, + "num_tokens": 782871367.0, + "step": 20516 + }, + { + "epoch": 2.609973285841496, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8613171577453613, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8870639204978943, + "num_tokens": 782905766.0, + "step": 20517 + }, + { + "epoch": 2.6101004961200864, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8045254945755005, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8703732490539551, + "num_tokens": 782942256.0, + "step": 20518 + }, + { + "epoch": 2.610227706398677, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.021368980407715, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8761581778526306, + "num_tokens": 782976156.0, + "step": 20519 + }, + { + "epoch": 2.6103549166772675, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9744772911071777, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8747251033782959, + "num_tokens": 783011450.0, + "step": 20520 + }, + { + "epoch": 2.610482126955858, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.830644965171814, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8708916902542114, + "num_tokens": 783050355.0, + "step": 20521 + }, + { + "epoch": 2.6106093372344485, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7588858604431152, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8761696815490723, + "num_tokens": 783091621.0, + "step": 20522 + }, + { + "epoch": 2.610736547513039, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7983040809631348, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8556399941444397, + "num_tokens": 783130471.0, + "step": 20523 + }, + { + "epoch": 2.6108637577916296, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7773247957229614, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8695050477981567, + "num_tokens": 783168340.0, + "step": 20524 + }, + { + "epoch": 2.61099096807022, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.127063035964966, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8473159074783325, + "num_tokens": 783207279.0, + "step": 20525 + }, + { + "epoch": 2.6111181783488107, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9296479225158691, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8588742017745972, + "num_tokens": 783243010.0, + "step": 20526 + }, + { + "epoch": 2.611245388627401, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3642284870147705, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8560430407524109, + "num_tokens": 783287548.0, + "step": 20527 + }, + { + "epoch": 2.6113725989059917, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8246822357177734, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8620493412017822, + "num_tokens": 783331549.0, + "step": 20528 + }, + { + "epoch": 2.6114998091845822, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8396036624908447, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8767445683479309, + "num_tokens": 783369445.0, + "step": 20529 + }, + { + "epoch": 2.6116270194631728, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8459115028381348, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8601946830749512, + "num_tokens": 783413218.0, + "step": 20530 + }, + { + "epoch": 2.6117542297417633, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8047748804092407, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8646693229675293, + "num_tokens": 783449944.0, + "step": 20531 + }, + { + "epoch": 2.611881440020354, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1933698654174805, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8657853007316589, + "num_tokens": 783486142.0, + "step": 20532 + }, + { + "epoch": 2.612008650298944, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0268969535827637, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8716036081314087, + "num_tokens": 783521170.0, + "step": 20533 + }, + { + "epoch": 2.612135860577535, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9704114198684692, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8627089262008667, + "num_tokens": 783563902.0, + "step": 20534 + }, + { + "epoch": 2.612263070856125, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8220524787902832, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8706598281860352, + "num_tokens": 783597655.0, + "step": 20535 + }, + { + "epoch": 2.612390281134716, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8365139961242676, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8561772108078003, + "num_tokens": 783635233.0, + "step": 20536 + }, + { + "epoch": 2.612517491413306, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1644039154052734, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8665688037872314, + "num_tokens": 783679677.0, + "step": 20537 + }, + { + "epoch": 2.6126447016918966, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8215445280075073, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8700065612792969, + "num_tokens": 783720608.0, + "step": 20538 + }, + { + "epoch": 2.612771911970487, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8796049356460571, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8752943873405457, + "num_tokens": 783753198.0, + "step": 20539 + }, + { + "epoch": 2.6128991222490776, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8321951627731323, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8645623922348022, + "num_tokens": 783791068.0, + "step": 20540 + }, + { + "epoch": 2.613026332527668, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8136355876922607, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8669750690460205, + "num_tokens": 783831319.0, + "step": 20541 + }, + { + "epoch": 2.6131535428062587, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9406464099884033, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8691215515136719, + "num_tokens": 783867537.0, + "step": 20542 + }, + { + "epoch": 2.613280753084849, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.497514247894287, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8590505719184875, + "num_tokens": 783907787.0, + "step": 20543 + }, + { + "epoch": 2.6134079633634397, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9469441175460815, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8589614629745483, + "num_tokens": 783948558.0, + "step": 20544 + }, + { + "epoch": 2.6135351736420303, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8527216911315918, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8843604326248169, + "num_tokens": 783986726.0, + "step": 20545 + }, + { + "epoch": 2.613662383920621, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.896818995475769, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8465420603752136, + "num_tokens": 784023449.0, + "step": 20546 + }, + { + "epoch": 2.6137895941992113, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8886010646820068, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8830469846725464, + "num_tokens": 784059529.0, + "step": 20547 + }, + { + "epoch": 2.613916804477802, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8955024480819702, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8667008876800537, + "num_tokens": 784103530.0, + "step": 20548 + }, + { + "epoch": 2.6140440147563924, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.917669653892517, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8589450120925903, + "num_tokens": 784141721.0, + "step": 20549 + }, + { + "epoch": 2.614171225034983, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.911478877067566, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8653984665870667, + "num_tokens": 784175868.0, + "step": 20550 + }, + { + "epoch": 2.6142984353135734, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8328604698181152, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8780410289764404, + "num_tokens": 784210646.0, + "step": 20551 + }, + { + "epoch": 2.614425645592164, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8407742977142334, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8706150650978088, + "num_tokens": 784255588.0, + "step": 20552 + }, + { + "epoch": 2.6145528558707545, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.925341010093689, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.86760413646698, + "num_tokens": 784292877.0, + "step": 20553 + }, + { + "epoch": 2.614680066149345, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7610645294189453, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8701350688934326, + "num_tokens": 784336150.0, + "step": 20554 + }, + { + "epoch": 2.6148072764279355, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8156027793884277, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8802796602249146, + "num_tokens": 784371547.0, + "step": 20555 + }, + { + "epoch": 2.6149344867065256, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9485050439834595, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8685132265090942, + "num_tokens": 784411473.0, + "step": 20556 + }, + { + "epoch": 2.6150616969851166, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8562260866165161, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.856268584728241, + "num_tokens": 784456713.0, + "step": 20557 + }, + { + "epoch": 2.6151889072637067, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8173902034759521, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8806463479995728, + "num_tokens": 784493685.0, + "step": 20558 + }, + { + "epoch": 2.6153161175422976, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6858571767807007, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8777495622634888, + "num_tokens": 784534641.0, + "step": 20559 + }, + { + "epoch": 2.6154433278208877, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8694456815719604, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8439315557479858, + "num_tokens": 784576184.0, + "step": 20560 + }, + { + "epoch": 2.6155705380994787, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8528999090194702, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.868929386138916, + "num_tokens": 784613851.0, + "step": 20561 + }, + { + "epoch": 2.615697748378069, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7496291399002075, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8681758642196655, + "num_tokens": 784654133.0, + "step": 20562 + }, + { + "epoch": 2.6158249586566593, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9886493682861328, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8681765794754028, + "num_tokens": 784690529.0, + "step": 20563 + }, + { + "epoch": 2.61595216893525, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.779624104499817, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8667249083518982, + "num_tokens": 784731972.0, + "step": 20564 + }, + { + "epoch": 2.6160793792138404, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8398877382278442, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8616167306900024, + "num_tokens": 784773275.0, + "step": 20565 + }, + { + "epoch": 2.616206589492431, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9258173704147339, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8498269319534302, + "num_tokens": 784811468.0, + "step": 20566 + }, + { + "epoch": 2.6163337997710214, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7813153266906738, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8683594465255737, + "num_tokens": 784850611.0, + "step": 20567 + }, + { + "epoch": 2.616461010049612, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1777267456054688, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8728636503219604, + "num_tokens": 784886502.0, + "step": 20568 + }, + { + "epoch": 2.6165882203282025, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.203829288482666, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.857154130935669, + "num_tokens": 784924489.0, + "step": 20569 + }, + { + "epoch": 2.616715430606793, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.058199405670166, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8714959621429443, + "num_tokens": 784957224.0, + "step": 20570 + }, + { + "epoch": 2.6168426408853835, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9327622652053833, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8413103222846985, + "num_tokens": 784997754.0, + "step": 20571 + }, + { + "epoch": 2.616969851163974, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8259856700897217, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8513741493225098, + "num_tokens": 785033932.0, + "step": 20572 + }, + { + "epoch": 2.6170970614425646, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8884897232055664, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8640218377113342, + "num_tokens": 785073190.0, + "step": 20573 + }, + { + "epoch": 2.617224271721155, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1486306190490723, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8824621438980103, + "num_tokens": 785108397.0, + "step": 20574 + }, + { + "epoch": 2.6173514819997457, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1143789291381836, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8933973908424377, + "num_tokens": 785142671.0, + "step": 20575 + }, + { + "epoch": 2.617478692278336, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9271090030670166, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8643587231636047, + "num_tokens": 785183467.0, + "step": 20576 + }, + { + "epoch": 2.6176059025569267, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8011690378189087, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8558825254440308, + "num_tokens": 785222072.0, + "step": 20577 + }, + { + "epoch": 2.6177331128355172, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1947832107543945, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8553064465522766, + "num_tokens": 785260458.0, + "step": 20578 + }, + { + "epoch": 2.6178603231141078, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.811554193496704, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8525680303573608, + "num_tokens": 785302434.0, + "step": 20579 + }, + { + "epoch": 2.6179875333926983, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.217564105987549, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8514738082885742, + "num_tokens": 785338251.0, + "step": 20580 + }, + { + "epoch": 2.6181147436712884, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7929213047027588, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8626499176025391, + "num_tokens": 785384789.0, + "step": 20581 + }, + { + "epoch": 2.6182419539498794, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7593528032302856, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8653267621994019, + "num_tokens": 785424978.0, + "step": 20582 + }, + { + "epoch": 2.6183691642284694, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.893078327178955, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8597725629806519, + "num_tokens": 785464421.0, + "step": 20583 + }, + { + "epoch": 2.6184963745070604, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8384970426559448, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8842281103134155, + "num_tokens": 785496166.0, + "step": 20584 + }, + { + "epoch": 2.6186235847856505, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.035104751586914, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8515145778656006, + "num_tokens": 785528504.0, + "step": 20585 + }, + { + "epoch": 2.6187507950642415, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8461214303970337, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8681645393371582, + "num_tokens": 785570896.0, + "step": 20586 + }, + { + "epoch": 2.6188780053428315, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.792728304862976, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8619935512542725, + "num_tokens": 785612464.0, + "step": 20587 + }, + { + "epoch": 2.619005215621422, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7301368713378906, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8662079572677612, + "num_tokens": 785655500.0, + "step": 20588 + }, + { + "epoch": 2.6191324259000126, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9214086532592773, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8681580424308777, + "num_tokens": 785691186.0, + "step": 20589 + }, + { + "epoch": 2.619259636178603, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8255798816680908, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.880110502243042, + "num_tokens": 785728575.0, + "step": 20590 + }, + { + "epoch": 2.6193868464571937, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8317707777023315, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8629263043403625, + "num_tokens": 785765782.0, + "step": 20591 + }, + { + "epoch": 2.619514056735784, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.058511972427368, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8703810572624207, + "num_tokens": 785800130.0, + "step": 20592 + }, + { + "epoch": 2.6196412670143747, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9803924560546875, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8926581740379333, + "num_tokens": 785836647.0, + "step": 20593 + }, + { + "epoch": 2.6197684772929652, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.92341148853302, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8591094017028809, + "num_tokens": 785873897.0, + "step": 20594 + }, + { + "epoch": 2.6198956875715558, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8482887744903564, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8614965677261353, + "num_tokens": 785914296.0, + "step": 20595 + }, + { + "epoch": 2.6200228978501463, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9389899969100952, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8539249897003174, + "num_tokens": 785951426.0, + "step": 20596 + }, + { + "epoch": 2.620150108128737, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7971104383468628, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8696872591972351, + "num_tokens": 785989906.0, + "step": 20597 + }, + { + "epoch": 2.6202773184073274, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 7.737663269042969, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8598402738571167, + "num_tokens": 786024763.0, + "step": 20598 + }, + { + "epoch": 2.620404528685918, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.270657539367676, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8619762659072876, + "num_tokens": 786063055.0, + "step": 20599 + }, + { + "epoch": 2.6205317389645084, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.2750084400177, + "learning_rate": 1e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.8384886980056763, + "num_tokens": 786097675.0, + "step": 20600 + }, + { + "epoch": 2.620658949243099, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 4.269343852996826, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8769261837005615, + "num_tokens": 786133721.0, + "step": 20601 + }, + { + "epoch": 2.6207861595216895, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0147206783294678, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.84779292345047, + "num_tokens": 786179615.0, + "step": 20602 + }, + { + "epoch": 2.62091336980028, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0012333393096924, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.878180742263794, + "num_tokens": 786214945.0, + "step": 20603 + }, + { + "epoch": 2.6210405800788705, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.897181510925293, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8672834634780884, + "num_tokens": 786250880.0, + "step": 20604 + }, + { + "epoch": 2.621167790357461, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8988615274429321, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.866823136806488, + "num_tokens": 786286095.0, + "step": 20605 + }, + { + "epoch": 2.621295000636051, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8051308393478394, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8708772659301758, + "num_tokens": 786319547.0, + "step": 20606 + }, + { + "epoch": 2.621422210914642, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.945045828819275, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8647963404655457, + "num_tokens": 786358699.0, + "step": 20607 + }, + { + "epoch": 2.621549421193232, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8136502504348755, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8699092864990234, + "num_tokens": 786399732.0, + "step": 20608 + }, + { + "epoch": 2.621676631471823, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9537975788116455, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8566285967826843, + "num_tokens": 786437375.0, + "step": 20609 + }, + { + "epoch": 2.6218038417504133, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8608216047286987, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8554230332374573, + "num_tokens": 786475538.0, + "step": 20610 + }, + { + "epoch": 2.621931052029004, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.765769600868225, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8677460551261902, + "num_tokens": 786516802.0, + "step": 20611 + }, + { + "epoch": 2.6220582623075943, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8287715911865234, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.86579430103302, + "num_tokens": 786558470.0, + "step": 20612 + }, + { + "epoch": 2.622185472586185, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.79081392288208, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8598204851150513, + "num_tokens": 786601163.0, + "step": 20613 + }, + { + "epoch": 2.6223126828647754, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9591262340545654, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8724546432495117, + "num_tokens": 786636741.0, + "step": 20614 + }, + { + "epoch": 2.622439893143366, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8962029218673706, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8532167673110962, + "num_tokens": 786672931.0, + "step": 20615 + }, + { + "epoch": 2.6225671034219564, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9097286462783813, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8778160810470581, + "num_tokens": 786709310.0, + "step": 20616 + }, + { + "epoch": 2.622694313700547, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.933883547782898, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.871269941329956, + "num_tokens": 786747113.0, + "step": 20617 + }, + { + "epoch": 2.6228215239791375, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0701820850372314, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.865199863910675, + "num_tokens": 786782033.0, + "step": 20618 + }, + { + "epoch": 2.622948734257728, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8529014587402344, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8663042187690735, + "num_tokens": 786827229.0, + "step": 20619 + }, + { + "epoch": 2.6230759445363185, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9991655349731445, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8746489882469177, + "num_tokens": 786861722.0, + "step": 20620 + }, + { + "epoch": 2.623203154814909, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9550329446792603, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8691388368606567, + "num_tokens": 786896202.0, + "step": 20621 + }, + { + "epoch": 2.6233303650934996, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8644384145736694, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8679851293563843, + "num_tokens": 786932154.0, + "step": 20622 + }, + { + "epoch": 2.62345757537209, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 7.786972999572754, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8570377230644226, + "num_tokens": 786972184.0, + "step": 20623 + }, + { + "epoch": 2.6235847856506807, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.925430178642273, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8593403697013855, + "num_tokens": 787015514.0, + "step": 20624 + }, + { + "epoch": 2.623711995929271, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9906744956970215, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8644943237304688, + "num_tokens": 787054023.0, + "step": 20625 + }, + { + "epoch": 2.6238392062078617, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.129099130630493, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8616433143615723, + "num_tokens": 787088858.0, + "step": 20626 + }, + { + "epoch": 2.6239664164864522, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.037935733795166, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.86795973777771, + "num_tokens": 787124805.0, + "step": 20627 + }, + { + "epoch": 2.6240936267650428, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7795733213424683, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8793439865112305, + "num_tokens": 787164922.0, + "step": 20628 + }, + { + "epoch": 2.6242208370436333, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.826598882675171, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8760279417037964, + "num_tokens": 787204936.0, + "step": 20629 + }, + { + "epoch": 2.624348047322224, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3995864391326904, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8780306577682495, + "num_tokens": 787238292.0, + "step": 20630 + }, + { + "epoch": 2.624475257600814, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.907529354095459, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8860616683959961, + "num_tokens": 787269054.0, + "step": 20631 + }, + { + "epoch": 2.624602467879405, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8073102235794067, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8639416694641113, + "num_tokens": 787310022.0, + "step": 20632 + }, + { + "epoch": 2.624729678157995, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9656754732131958, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8557745218276978, + "num_tokens": 787350917.0, + "step": 20633 + }, + { + "epoch": 2.624856888436586, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7555302381515503, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8744515776634216, + "num_tokens": 787388187.0, + "step": 20634 + }, + { + "epoch": 2.624984098715176, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8205498456954956, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.871864914894104, + "num_tokens": 787424998.0, + "step": 20635 + }, + { + "epoch": 2.6251113089937665, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.781147837638855, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8774513006210327, + "num_tokens": 787465456.0, + "step": 20636 + }, + { + "epoch": 2.625238519272357, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8442742824554443, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8643798828125, + "num_tokens": 787504797.0, + "step": 20637 + }, + { + "epoch": 2.6253657295509476, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9980719089508057, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8767947554588318, + "num_tokens": 787542911.0, + "step": 20638 + }, + { + "epoch": 2.625492939829538, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 80.52388000488281, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8557869791984558, + "num_tokens": 787576143.0, + "step": 20639 + }, + { + "epoch": 2.6256201501081287, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9291324615478516, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8670334219932556, + "num_tokens": 787619912.0, + "step": 20640 + }, + { + "epoch": 2.625747360386719, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8576221466064453, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8550534248352051, + "num_tokens": 787658181.0, + "step": 20641 + }, + { + "epoch": 2.6258745706653097, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9489860534667969, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8558628559112549, + "num_tokens": 787694889.0, + "step": 20642 + }, + { + "epoch": 2.6260017809439002, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.795377492904663, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8806606531143188, + "num_tokens": 787737633.0, + "step": 20643 + }, + { + "epoch": 2.6261289912224908, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.76408052444458, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8547953367233276, + "num_tokens": 787779069.0, + "step": 20644 + }, + { + "epoch": 2.6262562015010813, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9378093481063843, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8521850109100342, + "num_tokens": 787815544.0, + "step": 20645 + }, + { + "epoch": 2.626383411779672, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.885827898979187, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8605343699455261, + "num_tokens": 787852684.0, + "step": 20646 + }, + { + "epoch": 2.6265106220582624, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6656379699707031, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8623062968254089, + "num_tokens": 787893452.0, + "step": 20647 + }, + { + "epoch": 2.626637832336853, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8677690029144287, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8661684989929199, + "num_tokens": 787929430.0, + "step": 20648 + }, + { + "epoch": 2.6267650426154434, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7501986026763916, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8632382154464722, + "num_tokens": 787969768.0, + "step": 20649 + }, + { + "epoch": 2.626892252894034, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9814951419830322, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8668308854103088, + "num_tokens": 788009605.0, + "step": 20650 + }, + { + "epoch": 2.6270194631726245, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8460681438446045, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8777817487716675, + "num_tokens": 788044652.0, + "step": 20651 + }, + { + "epoch": 2.627146673451215, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9891738891601562, + "learning_rate": 1e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.838647723197937, + "num_tokens": 788085981.0, + "step": 20652 + }, + { + "epoch": 2.6272738837298055, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7863951921463013, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.863256573677063, + "num_tokens": 788132196.0, + "step": 20653 + }, + { + "epoch": 2.6274010940083956, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0227367877960205, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8776102066040039, + "num_tokens": 788160099.0, + "step": 20654 + }, + { + "epoch": 2.6275283042869866, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9697474241256714, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8632490634918213, + "num_tokens": 788197264.0, + "step": 20655 + }, + { + "epoch": 2.6276555145655767, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9348063468933105, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8767822980880737, + "num_tokens": 788232900.0, + "step": 20656 + }, + { + "epoch": 2.6277827248441676, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8969491720199585, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8607591390609741, + "num_tokens": 788273720.0, + "step": 20657 + }, + { + "epoch": 2.6279099351227577, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.08027720451355, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8677152395248413, + "num_tokens": 788308425.0, + "step": 20658 + }, + { + "epoch": 2.6280371454013487, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9026165008544922, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8564037084579468, + "num_tokens": 788343701.0, + "step": 20659 + }, + { + "epoch": 2.628164355679939, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1422765254974365, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8506953120231628, + "num_tokens": 788372789.0, + "step": 20660 + }, + { + "epoch": 2.6282915659585293, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0165960788726807, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8704317808151245, + "num_tokens": 788408422.0, + "step": 20661 + }, + { + "epoch": 2.62841877623712, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8733248710632324, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8688457608222961, + "num_tokens": 788446113.0, + "step": 20662 + }, + { + "epoch": 2.6285459865157104, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8124538660049438, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8593937158584595, + "num_tokens": 788485009.0, + "step": 20663 + }, + { + "epoch": 2.628673196794301, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8307889699935913, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8688108921051025, + "num_tokens": 788523446.0, + "step": 20664 + }, + { + "epoch": 2.6288004070728914, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9686070680618286, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8751662969589233, + "num_tokens": 788557423.0, + "step": 20665 + }, + { + "epoch": 2.628927617351482, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.331197500228882, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8610237836837769, + "num_tokens": 788588177.0, + "step": 20666 + }, + { + "epoch": 2.6290548276300725, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9578825235366821, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8663147687911987, + "num_tokens": 788624877.0, + "step": 20667 + }, + { + "epoch": 2.629182037908663, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1066126823425293, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8753973245620728, + "num_tokens": 788662532.0, + "step": 20668 + }, + { + "epoch": 2.6293092481872535, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8868180513381958, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8657955527305603, + "num_tokens": 788702734.0, + "step": 20669 + }, + { + "epoch": 2.629436458465844, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.902077555656433, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8650660514831543, + "num_tokens": 788738041.0, + "step": 20670 + }, + { + "epoch": 2.6295636687444346, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0205318927764893, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8650137782096863, + "num_tokens": 788772998.0, + "step": 20671 + }, + { + "epoch": 2.629690879023025, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8185958862304688, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8497369289398193, + "num_tokens": 788814310.0, + "step": 20672 + }, + { + "epoch": 2.6298180893016156, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.771763801574707, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8694249987602234, + "num_tokens": 788852811.0, + "step": 20673 + }, + { + "epoch": 2.629945299580206, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.023852825164795, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8735411167144775, + "num_tokens": 788888756.0, + "step": 20674 + }, + { + "epoch": 2.6300725098587967, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8843475580215454, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8699841499328613, + "num_tokens": 788926292.0, + "step": 20675 + }, + { + "epoch": 2.6301997201373872, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8464365005493164, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8759639263153076, + "num_tokens": 788965625.0, + "step": 20676 + }, + { + "epoch": 2.6303269304159778, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.4360921382904053, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8715395927429199, + "num_tokens": 789007390.0, + "step": 20677 + }, + { + "epoch": 2.6304541406945683, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0505547523498535, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.868977963924408, + "num_tokens": 789047940.0, + "step": 20678 + }, + { + "epoch": 2.6305813509731584, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9654264450073242, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8656362295150757, + "num_tokens": 789086321.0, + "step": 20679 + }, + { + "epoch": 2.6307085612517493, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.036202907562256, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.879188597202301, + "num_tokens": 789121757.0, + "step": 20680 + }, + { + "epoch": 2.6308357715303394, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.794053077697754, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8801051378250122, + "num_tokens": 789160076.0, + "step": 20681 + }, + { + "epoch": 2.6309629818089304, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9552479982376099, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8574905395507812, + "num_tokens": 789197080.0, + "step": 20682 + }, + { + "epoch": 2.6310901920875205, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7780729532241821, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8607595562934875, + "num_tokens": 789249876.0, + "step": 20683 + }, + { + "epoch": 2.6312174023661115, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7906501293182373, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8648765683174133, + "num_tokens": 789290146.0, + "step": 20684 + }, + { + "epoch": 2.6313446126447015, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9348112344741821, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.869880735874176, + "num_tokens": 789335610.0, + "step": 20685 + }, + { + "epoch": 2.631471822923292, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0146842002868652, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8606652617454529, + "num_tokens": 789371956.0, + "step": 20686 + }, + { + "epoch": 2.6315990332018826, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.93007230758667, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8626289367675781, + "num_tokens": 789415257.0, + "step": 20687 + }, + { + "epoch": 2.631726243480473, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8496054410934448, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.866489052772522, + "num_tokens": 789452533.0, + "step": 20688 + }, + { + "epoch": 2.6318534537590637, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9771885871887207, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8620935678482056, + "num_tokens": 789488780.0, + "step": 20689 + }, + { + "epoch": 2.631980664037654, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 16.616418838500977, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8628937005996704, + "num_tokens": 789528169.0, + "step": 20690 + }, + { + "epoch": 2.6321078743162447, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.066197633743286, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8583046197891235, + "num_tokens": 789568382.0, + "step": 20691 + }, + { + "epoch": 2.6322350845948352, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9081851243972778, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8622255325317383, + "num_tokens": 789606895.0, + "step": 20692 + }, + { + "epoch": 2.6323622948734258, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9104363918304443, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8500560522079468, + "num_tokens": 789649106.0, + "step": 20693 + }, + { + "epoch": 2.6324895051520163, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7729743719100952, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.869726300239563, + "num_tokens": 789689449.0, + "step": 20694 + }, + { + "epoch": 2.632616715430607, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8527820110321045, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8767943382263184, + "num_tokens": 789726179.0, + "step": 20695 + }, + { + "epoch": 2.6327439257091974, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8018772602081299, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8479666709899902, + "num_tokens": 789764858.0, + "step": 20696 + }, + { + "epoch": 2.632871135987788, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1746015548706055, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8527315855026245, + "num_tokens": 789799196.0, + "step": 20697 + }, + { + "epoch": 2.6329983462663784, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.790295958518982, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8646230697631836, + "num_tokens": 789840498.0, + "step": 20698 + }, + { + "epoch": 2.633125556544969, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9633845090866089, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8696141242980957, + "num_tokens": 789874269.0, + "step": 20699 + }, + { + "epoch": 2.6332527668235595, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.772179126739502, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8668704628944397, + "num_tokens": 789914938.0, + "step": 20700 + }, + { + "epoch": 2.63337997710215, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6910982131958008, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8667998313903809, + "num_tokens": 789959219.0, + "step": 20701 + }, + { + "epoch": 2.6335071873807405, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9494966268539429, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.859056830406189, + "num_tokens": 789993707.0, + "step": 20702 + }, + { + "epoch": 2.633634397659331, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.973191261291504, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8465051651000977, + "num_tokens": 790034285.0, + "step": 20703 + }, + { + "epoch": 2.633761607937921, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8552082777023315, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8694887161254883, + "num_tokens": 790069714.0, + "step": 20704 + }, + { + "epoch": 2.633888818216512, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8398929834365845, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8650859594345093, + "num_tokens": 790110733.0, + "step": 20705 + }, + { + "epoch": 2.634016028495102, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.882401943206787, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8698999285697937, + "num_tokens": 790144468.0, + "step": 20706 + }, + { + "epoch": 2.634143238773693, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1869874000549316, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8631405830383301, + "num_tokens": 790186881.0, + "step": 20707 + }, + { + "epoch": 2.6342704490522832, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9888359308242798, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8718674778938293, + "num_tokens": 790225083.0, + "step": 20708 + }, + { + "epoch": 2.6343976593308738, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8076854944229126, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8656288981437683, + "num_tokens": 790266822.0, + "step": 20709 + }, + { + "epoch": 2.6345248696094643, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9612399339675903, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8621982336044312, + "num_tokens": 790305948.0, + "step": 20710 + }, + { + "epoch": 2.634652079888055, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.11600923538208, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8674297332763672, + "num_tokens": 790341639.0, + "step": 20711 + }, + { + "epoch": 2.6347792901666454, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9982869625091553, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8748704195022583, + "num_tokens": 790377442.0, + "step": 20712 + }, + { + "epoch": 2.634906500445236, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8080791234970093, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8631568551063538, + "num_tokens": 790414911.0, + "step": 20713 + }, + { + "epoch": 2.6350337107238264, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9028613567352295, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8644170761108398, + "num_tokens": 790453968.0, + "step": 20714 + }, + { + "epoch": 2.635160921002417, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.025768518447876, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.861660361289978, + "num_tokens": 790493216.0, + "step": 20715 + }, + { + "epoch": 2.6352881312810075, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9374479055404663, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8609763383865356, + "num_tokens": 790529380.0, + "step": 20716 + }, + { + "epoch": 2.635415341559598, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.890750765800476, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8606339693069458, + "num_tokens": 790569568.0, + "step": 20717 + }, + { + "epoch": 2.6355425518381885, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0227768421173096, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8704602718353271, + "num_tokens": 790602269.0, + "step": 20718 + }, + { + "epoch": 2.635669762116779, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9658914804458618, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8542671203613281, + "num_tokens": 790639723.0, + "step": 20719 + }, + { + "epoch": 2.6357969723953696, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0066611766815186, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8507790565490723, + "num_tokens": 790676103.0, + "step": 20720 + }, + { + "epoch": 2.63592418267396, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9909422397613525, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8664648532867432, + "num_tokens": 790712118.0, + "step": 20721 + }, + { + "epoch": 2.6360513929525506, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.885249137878418, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8653683066368103, + "num_tokens": 790747079.0, + "step": 20722 + }, + { + "epoch": 2.636178603231141, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9950536489486694, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8562763929367065, + "num_tokens": 790781741.0, + "step": 20723 + }, + { + "epoch": 2.6363058135097317, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8754608631134033, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8646823167800903, + "num_tokens": 790820243.0, + "step": 20724 + }, + { + "epoch": 2.6364330237883222, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9409748315811157, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8803399205207825, + "num_tokens": 790851492.0, + "step": 20725 + }, + { + "epoch": 2.6365602340669128, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.195916175842285, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8703843951225281, + "num_tokens": 790882293.0, + "step": 20726 + }, + { + "epoch": 2.636687444345503, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9699627161026, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8606137037277222, + "num_tokens": 790922188.0, + "step": 20727 + }, + { + "epoch": 2.636814654624094, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9219775199890137, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8743867874145508, + "num_tokens": 790960675.0, + "step": 20728 + }, + { + "epoch": 2.636941864902684, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8592242002487183, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8740445375442505, + "num_tokens": 790999352.0, + "step": 20729 + }, + { + "epoch": 2.637069075181275, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8649338483810425, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8658918142318726, + "num_tokens": 791035898.0, + "step": 20730 + }, + { + "epoch": 2.637196285459865, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.816825270652771, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8416591882705688, + "num_tokens": 791079731.0, + "step": 20731 + }, + { + "epoch": 2.637323495738456, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0040884017944336, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8767228126525879, + "num_tokens": 791118565.0, + "step": 20732 + }, + { + "epoch": 2.637450706017046, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9775056838989258, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.865686297416687, + "num_tokens": 791154876.0, + "step": 20733 + }, + { + "epoch": 2.6375779162956365, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8931430578231812, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8651256561279297, + "num_tokens": 791196003.0, + "step": 20734 + }, + { + "epoch": 2.637705126574227, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.023024797439575, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8645185232162476, + "num_tokens": 791236462.0, + "step": 20735 + }, + { + "epoch": 2.6378323368528176, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0093190670013428, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8523788452148438, + "num_tokens": 791271918.0, + "step": 20736 + }, + { + "epoch": 2.637959547131408, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.938450813293457, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8774105310440063, + "num_tokens": 791309813.0, + "step": 20737 + }, + { + "epoch": 2.6380867574099987, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.178661823272705, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8732866644859314, + "num_tokens": 791338854.0, + "step": 20738 + }, + { + "epoch": 2.638213967688589, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8933959007263184, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.870775580406189, + "num_tokens": 791376580.0, + "step": 20739 + }, + { + "epoch": 2.6383411779671797, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9829517602920532, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8627639412879944, + "num_tokens": 791415931.0, + "step": 20740 + }, + { + "epoch": 2.6384683882457702, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.06770396232605, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8613494038581848, + "num_tokens": 791448201.0, + "step": 20741 + }, + { + "epoch": 2.6385955985243608, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.964821457862854, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8577654361724854, + "num_tokens": 791489183.0, + "step": 20742 + }, + { + "epoch": 2.6387228088029513, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0250377655029297, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.866255521774292, + "num_tokens": 791528414.0, + "step": 20743 + }, + { + "epoch": 2.638850019081542, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9185469150543213, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8842885494232178, + "num_tokens": 791566379.0, + "step": 20744 + }, + { + "epoch": 2.6389772293601323, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9755569696426392, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8668636083602905, + "num_tokens": 791599789.0, + "step": 20745 + }, + { + "epoch": 2.639104439638723, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.105787515640259, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8503944277763367, + "num_tokens": 791642333.0, + "step": 20746 + }, + { + "epoch": 2.6392316499173134, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9174001216888428, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8669293522834778, + "num_tokens": 791687190.0, + "step": 20747 + }, + { + "epoch": 2.639358860195904, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.027069568634033, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.850550651550293, + "num_tokens": 791722806.0, + "step": 20748 + }, + { + "epoch": 2.6394860704744945, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.025242805480957, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8505030274391174, + "num_tokens": 791757726.0, + "step": 20749 + }, + { + "epoch": 2.639613280753085, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.905015468597412, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8727096915245056, + "num_tokens": 791795160.0, + "step": 20750 + }, + { + "epoch": 2.6397404910316755, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9102013111114502, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8781101703643799, + "num_tokens": 791831682.0, + "step": 20751 + }, + { + "epoch": 2.6398677013102656, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.02514910697937, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8764094114303589, + "num_tokens": 791863230.0, + "step": 20752 + }, + { + "epoch": 2.6399949115888566, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.675750494003296, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8800022602081299, + "num_tokens": 791904323.0, + "step": 20753 + }, + { + "epoch": 2.6401221218674467, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7438340187072754, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8580886721611023, + "num_tokens": 791951836.0, + "step": 20754 + }, + { + "epoch": 2.6402493321460376, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8561166524887085, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8686788082122803, + "num_tokens": 791989782.0, + "step": 20755 + }, + { + "epoch": 2.6403765424246277, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9986234903335571, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8657535314559937, + "num_tokens": 792024038.0, + "step": 20756 + }, + { + "epoch": 2.6405037527032187, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8385725021362305, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8663775324821472, + "num_tokens": 792067893.0, + "step": 20757 + }, + { + "epoch": 2.6406309629818088, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.893694519996643, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8745941519737244, + "num_tokens": 792108593.0, + "step": 20758 + }, + { + "epoch": 2.6407581732603993, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.917502999305725, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8780296444892883, + "num_tokens": 792141201.0, + "step": 20759 + }, + { + "epoch": 2.64088538353899, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.925223469734192, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8864749670028687, + "num_tokens": 792182908.0, + "step": 20760 + }, + { + "epoch": 2.6410125938175804, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9485973119735718, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8665106296539307, + "num_tokens": 792225223.0, + "step": 20761 + }, + { + "epoch": 2.641139804096171, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0815532207489014, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8482998609542847, + "num_tokens": 792258131.0, + "step": 20762 + }, + { + "epoch": 2.6412670143747614, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.041032314300537, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8718830347061157, + "num_tokens": 792291429.0, + "step": 20763 + }, + { + "epoch": 2.641394224653352, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9724407196044922, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8615893125534058, + "num_tokens": 792327777.0, + "step": 20764 + }, + { + "epoch": 2.6415214349319425, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8724937438964844, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8687702417373657, + "num_tokens": 792367918.0, + "step": 20765 + }, + { + "epoch": 2.641648645210533, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9688496589660645, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8699127435684204, + "num_tokens": 792405550.0, + "step": 20766 + }, + { + "epoch": 2.6417758554891235, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 4.006625175476074, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8695652484893799, + "num_tokens": 792440948.0, + "step": 20767 + }, + { + "epoch": 2.641903065767714, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7906907796859741, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8733316659927368, + "num_tokens": 792488065.0, + "step": 20768 + }, + { + "epoch": 2.6420302760463046, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.157632827758789, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8723794221878052, + "num_tokens": 792522845.0, + "step": 20769 + }, + { + "epoch": 2.642157486324895, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.020779848098755, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8757783770561218, + "num_tokens": 792561118.0, + "step": 20770 + }, + { + "epoch": 2.6422846966034856, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.910069465637207, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8662182092666626, + "num_tokens": 792600125.0, + "step": 20771 + }, + { + "epoch": 2.642411906882076, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9262325763702393, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8702366948127747, + "num_tokens": 792633856.0, + "step": 20772 + }, + { + "epoch": 2.6425391171606667, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8916012048721313, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8612654209136963, + "num_tokens": 792672427.0, + "step": 20773 + }, + { + "epoch": 2.6426663274392572, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7833040952682495, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8436335325241089, + "num_tokens": 792713856.0, + "step": 20774 + }, + { + "epoch": 2.6427935377178478, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8238581418991089, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8575562834739685, + "num_tokens": 792757602.0, + "step": 20775 + }, + { + "epoch": 2.6429207479964383, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.866060495376587, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8551905155181885, + "num_tokens": 792797067.0, + "step": 20776 + }, + { + "epoch": 2.6430479582750284, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8345645666122437, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8696040511131287, + "num_tokens": 792835748.0, + "step": 20777 + }, + { + "epoch": 2.6431751685536193, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8008308410644531, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8675680160522461, + "num_tokens": 792875039.0, + "step": 20778 + }, + { + "epoch": 2.6433023788322094, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8094319105148315, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8720147609710693, + "num_tokens": 792916580.0, + "step": 20779 + }, + { + "epoch": 2.6434295891108004, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.122823476791382, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8668575882911682, + "num_tokens": 792952507.0, + "step": 20780 + }, + { + "epoch": 2.6435567993893905, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8952610492706299, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8671578764915466, + "num_tokens": 792994046.0, + "step": 20781 + }, + { + "epoch": 2.6436840096679814, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.735234260559082, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8852684497833252, + "num_tokens": 793033855.0, + "step": 20782 + }, + { + "epoch": 2.6438112199465715, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8804141283035278, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8697461485862732, + "num_tokens": 793072226.0, + "step": 20783 + }, + { + "epoch": 2.643938430225162, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9624252319335938, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8691257238388062, + "num_tokens": 793110884.0, + "step": 20784 + }, + { + "epoch": 2.6440656405037526, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8967138528823853, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8478113412857056, + "num_tokens": 793151885.0, + "step": 20785 + }, + { + "epoch": 2.644192850782343, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1008431911468506, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8442268371582031, + "num_tokens": 793187253.0, + "step": 20786 + }, + { + "epoch": 2.6443200610609336, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9442949295043945, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8711321949958801, + "num_tokens": 793231249.0, + "step": 20787 + }, + { + "epoch": 2.644447271339524, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8484559059143066, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8772363066673279, + "num_tokens": 793269918.0, + "step": 20788 + }, + { + "epoch": 2.6445744816181147, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7476831674575806, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8764917850494385, + "num_tokens": 793306856.0, + "step": 20789 + }, + { + "epoch": 2.6447016918967052, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7777572870254517, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8557817935943604, + "num_tokens": 793347866.0, + "step": 20790 + }, + { + "epoch": 2.6448289021752958, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8506048917770386, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8634552955627441, + "num_tokens": 793391037.0, + "step": 20791 + }, + { + "epoch": 2.6449561124538863, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.172856092453003, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8521599769592285, + "num_tokens": 793423560.0, + "step": 20792 + }, + { + "epoch": 2.645083322732477, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8951568603515625, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8719760179519653, + "num_tokens": 793460215.0, + "step": 20793 + }, + { + "epoch": 2.6452105330110673, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0537638664245605, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8525800108909607, + "num_tokens": 793492671.0, + "step": 20794 + }, + { + "epoch": 2.645337743289658, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9884084463119507, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8648009896278381, + "num_tokens": 793529526.0, + "step": 20795 + }, + { + "epoch": 2.6454649535682484, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8839956521987915, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8743906021118164, + "num_tokens": 793566748.0, + "step": 20796 + }, + { + "epoch": 2.645592163846839, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9472209215164185, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8808348178863525, + "num_tokens": 793600703.0, + "step": 20797 + }, + { + "epoch": 2.6457193741254295, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8305695056915283, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8646948933601379, + "num_tokens": 793639226.0, + "step": 20798 + }, + { + "epoch": 2.64584658440402, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9846926927566528, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8794917464256287, + "num_tokens": 793676147.0, + "step": 20799 + }, + { + "epoch": 2.6459737946826105, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.025433301925659, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8769292831420898, + "num_tokens": 793715778.0, + "step": 20800 + }, + { + "epoch": 2.646101004961201, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8592842817306519, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8813512325286865, + "num_tokens": 793752096.0, + "step": 20801 + }, + { + "epoch": 2.646228215239791, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2003257274627686, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8606520891189575, + "num_tokens": 793789809.0, + "step": 20802 + }, + { + "epoch": 2.646355425518382, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2719180583953857, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8628503680229187, + "num_tokens": 793826505.0, + "step": 20803 + }, + { + "epoch": 2.646482635796972, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8605901002883911, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.864359974861145, + "num_tokens": 793868360.0, + "step": 20804 + }, + { + "epoch": 2.646609846075563, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9166760444641113, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8659069538116455, + "num_tokens": 793906566.0, + "step": 20805 + }, + { + "epoch": 2.6467370563541532, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8488667011260986, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8627582788467407, + "num_tokens": 793947814.0, + "step": 20806 + }, + { + "epoch": 2.6468642666327438, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.831757664680481, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8709282875061035, + "num_tokens": 793987391.0, + "step": 20807 + }, + { + "epoch": 2.6469914769113343, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8135387897491455, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8767004609107971, + "num_tokens": 794024045.0, + "step": 20808 + }, + { + "epoch": 2.647118687189925, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0105042457580566, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8703503012657166, + "num_tokens": 794063405.0, + "step": 20809 + }, + { + "epoch": 2.6472458974685154, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2310211658477783, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8418802618980408, + "num_tokens": 794099729.0, + "step": 20810 + }, + { + "epoch": 2.647373107747106, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.971703290939331, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8579578399658203, + "num_tokens": 794139669.0, + "step": 20811 + }, + { + "epoch": 2.6475003180256964, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.841211199760437, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8689125180244446, + "num_tokens": 794176906.0, + "step": 20812 + }, + { + "epoch": 2.647627528304287, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8356677293777466, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8703466653823853, + "num_tokens": 794216594.0, + "step": 20813 + }, + { + "epoch": 2.6477547385828775, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1060116291046143, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8697388768196106, + "num_tokens": 794248379.0, + "step": 20814 + }, + { + "epoch": 2.647881948861468, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9367072582244873, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8417710065841675, + "num_tokens": 794293604.0, + "step": 20815 + }, + { + "epoch": 2.6480091591400585, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.968894600868225, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8581901788711548, + "num_tokens": 794332084.0, + "step": 20816 + }, + { + "epoch": 2.648136369418649, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7844823598861694, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8817284107208252, + "num_tokens": 794369158.0, + "step": 20817 + }, + { + "epoch": 2.6482635796972396, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8848119974136353, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8697842359542847, + "num_tokens": 794410531.0, + "step": 20818 + }, + { + "epoch": 2.64839078997583, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7971091270446777, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8668909668922424, + "num_tokens": 794457556.0, + "step": 20819 + }, + { + "epoch": 2.6485180002544206, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0048861503601074, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8553813099861145, + "num_tokens": 794492737.0, + "step": 20820 + }, + { + "epoch": 2.648645210533011, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7515053749084473, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8769676685333252, + "num_tokens": 794532589.0, + "step": 20821 + }, + { + "epoch": 2.6487724208116017, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.012211322784424, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8551145792007446, + "num_tokens": 794566982.0, + "step": 20822 + }, + { + "epoch": 2.648899631090192, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2626821994781494, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.849199652671814, + "num_tokens": 794604278.0, + "step": 20823 + }, + { + "epoch": 2.6490268413687827, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8300352096557617, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8751544952392578, + "num_tokens": 794643785.0, + "step": 20824 + }, + { + "epoch": 2.649154051647373, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9574217796325684, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8752772808074951, + "num_tokens": 794678302.0, + "step": 20825 + }, + { + "epoch": 2.649281261925964, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9893709421157837, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8573289513587952, + "num_tokens": 794715732.0, + "step": 20826 + }, + { + "epoch": 2.649408472204554, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7357778549194336, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.870916485786438, + "num_tokens": 794754359.0, + "step": 20827 + }, + { + "epoch": 2.649535682483145, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8253848552703857, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8634757995605469, + "num_tokens": 794795770.0, + "step": 20828 + }, + { + "epoch": 2.649662892761735, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.939675211906433, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8587989807128906, + "num_tokens": 794835654.0, + "step": 20829 + }, + { + "epoch": 2.649790103040326, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9073259830474854, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8698344826698303, + "num_tokens": 794872527.0, + "step": 20830 + }, + { + "epoch": 2.649917313318916, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.6740775108337402, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8702304363250732, + "num_tokens": 794918577.0, + "step": 20831 + }, + { + "epoch": 2.6500445235975065, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9008255004882812, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8771816492080688, + "num_tokens": 794953566.0, + "step": 20832 + }, + { + "epoch": 2.650171733876097, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9393620491027832, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8737452626228333, + "num_tokens": 794990547.0, + "step": 20833 + }, + { + "epoch": 2.6502989441546876, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7427817583084106, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8838574886322021, + "num_tokens": 795028419.0, + "step": 20834 + }, + { + "epoch": 2.650426154433278, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.950518250465393, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8825867176055908, + "num_tokens": 795064303.0, + "step": 20835 + }, + { + "epoch": 2.6505533647118686, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.931746244430542, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8431624174118042, + "num_tokens": 795104280.0, + "step": 20836 + }, + { + "epoch": 2.650680574990459, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9126797914505005, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8579877614974976, + "num_tokens": 795142547.0, + "step": 20837 + }, + { + "epoch": 2.6508077852690497, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8841921091079712, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8635638356208801, + "num_tokens": 795186193.0, + "step": 20838 + }, + { + "epoch": 2.6509349955476402, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1124281883239746, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8845392465591431, + "num_tokens": 795215453.0, + "step": 20839 + }, + { + "epoch": 2.6510622058262308, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0464370250701904, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8623101711273193, + "num_tokens": 795252820.0, + "step": 20840 + }, + { + "epoch": 2.6511894161048213, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9570139646530151, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8642488121986389, + "num_tokens": 795290000.0, + "step": 20841 + }, + { + "epoch": 2.651316626383412, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8341962099075317, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8829509019851685, + "num_tokens": 795330389.0, + "step": 20842 + }, + { + "epoch": 2.6514438366620023, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3114960193634033, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8615160584449768, + "num_tokens": 795366693.0, + "step": 20843 + }, + { + "epoch": 2.651571046940593, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9045860767364502, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8686721324920654, + "num_tokens": 795405748.0, + "step": 20844 + }, + { + "epoch": 2.6516982572191834, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.5712451934814453, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8780783414840698, + "num_tokens": 795446179.0, + "step": 20845 + }, + { + "epoch": 2.651825467497774, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8634607791900635, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.86263108253479, + "num_tokens": 795488234.0, + "step": 20846 + }, + { + "epoch": 2.6519526777763645, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9540016651153564, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8628826141357422, + "num_tokens": 795526508.0, + "step": 20847 + }, + { + "epoch": 2.652079888054955, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9266762733459473, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8739396333694458, + "num_tokens": 795566299.0, + "step": 20848 + }, + { + "epoch": 2.6522070983335455, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9968563318252563, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8615546822547913, + "num_tokens": 795603892.0, + "step": 20849 + }, + { + "epoch": 2.6523343086121356, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.025590658187866, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8747818470001221, + "num_tokens": 795639622.0, + "step": 20850 + }, + { + "epoch": 2.6524615188907266, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.150292158126831, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8888039588928223, + "num_tokens": 795677299.0, + "step": 20851 + }, + { + "epoch": 2.6525887291693167, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0247349739074707, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8657524585723877, + "num_tokens": 795714304.0, + "step": 20852 + }, + { + "epoch": 2.6527159394479076, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9457998275756836, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.862114667892456, + "num_tokens": 795752330.0, + "step": 20853 + }, + { + "epoch": 2.6528431497264977, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9451923370361328, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8456240296363831, + "num_tokens": 795789503.0, + "step": 20854 + }, + { + "epoch": 2.6529703600050887, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.811547875404358, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8651716709136963, + "num_tokens": 795828449.0, + "step": 20855 + }, + { + "epoch": 2.6530975702836788, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8959400653839111, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8678320050239563, + "num_tokens": 795868454.0, + "step": 20856 + }, + { + "epoch": 2.6532247805622693, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9010387659072876, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8523721694946289, + "num_tokens": 795904629.0, + "step": 20857 + }, + { + "epoch": 2.65335199084086, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8384623527526855, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8537068367004395, + "num_tokens": 795943197.0, + "step": 20858 + }, + { + "epoch": 2.6534792011194503, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7020139694213867, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8625928163528442, + "num_tokens": 795984672.0, + "step": 20859 + }, + { + "epoch": 2.653606411398041, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9322699308395386, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8470412492752075, + "num_tokens": 796022599.0, + "step": 20860 + }, + { + "epoch": 2.6537336216766314, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9046497344970703, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8653490543365479, + "num_tokens": 796057798.0, + "step": 20861 + }, + { + "epoch": 2.653860831955222, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.888380527496338, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8646001815795898, + "num_tokens": 796094928.0, + "step": 20862 + }, + { + "epoch": 2.6539880422338125, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8748626708984375, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.867962121963501, + "num_tokens": 796132614.0, + "step": 20863 + }, + { + "epoch": 2.654115252512403, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0146560668945312, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8643966317176819, + "num_tokens": 796171653.0, + "step": 20864 + }, + { + "epoch": 2.6542424627909935, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.861680269241333, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8595731854438782, + "num_tokens": 796215218.0, + "step": 20865 + }, + { + "epoch": 2.654369673069584, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8602322340011597, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8787826299667358, + "num_tokens": 796249165.0, + "step": 20866 + }, + { + "epoch": 2.6544968833481746, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.874332308769226, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8636187314987183, + "num_tokens": 796283878.0, + "step": 20867 + }, + { + "epoch": 2.654624093626765, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7643115520477295, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8538382053375244, + "num_tokens": 796330760.0, + "step": 20868 + }, + { + "epoch": 2.6547513039053556, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.931064486503601, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8535987138748169, + "num_tokens": 796371547.0, + "step": 20869 + }, + { + "epoch": 2.654878514183946, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8846503496170044, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.851334273815155, + "num_tokens": 796409893.0, + "step": 20870 + }, + { + "epoch": 2.6550057244625367, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0350699424743652, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8508095741271973, + "num_tokens": 796442493.0, + "step": 20871 + }, + { + "epoch": 2.655132934741127, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9163192510604858, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8684611320495605, + "num_tokens": 796480853.0, + "step": 20872 + }, + { + "epoch": 2.6552601450197177, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7937334775924683, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8695582747459412, + "num_tokens": 796520057.0, + "step": 20873 + }, + { + "epoch": 2.6553873552983083, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0096960067749023, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8689494132995605, + "num_tokens": 796549779.0, + "step": 20874 + }, + { + "epoch": 2.6555145655768984, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0174922943115234, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.869429349899292, + "num_tokens": 796590923.0, + "step": 20875 + }, + { + "epoch": 2.6556417758554893, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 6.414922714233398, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.861818253993988, + "num_tokens": 796629455.0, + "step": 20876 + }, + { + "epoch": 2.6557689861340794, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.1825826168060303, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8623577356338501, + "num_tokens": 796669950.0, + "step": 20877 + }, + { + "epoch": 2.6558961964126704, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9286165237426758, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8544785380363464, + "num_tokens": 796712263.0, + "step": 20878 + }, + { + "epoch": 2.6560234066912605, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8413971662521362, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8650378584861755, + "num_tokens": 796753025.0, + "step": 20879 + }, + { + "epoch": 2.6561506169698514, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.167926549911499, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8784617185592651, + "num_tokens": 796788810.0, + "step": 20880 + }, + { + "epoch": 2.6562778272484415, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8556199073791504, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8678823709487915, + "num_tokens": 796826275.0, + "step": 20881 + }, + { + "epoch": 2.656405037527032, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1657490730285645, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8604700565338135, + "num_tokens": 796859587.0, + "step": 20882 + }, + { + "epoch": 2.6565322478056226, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.841939926147461, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8722406029701233, + "num_tokens": 796899440.0, + "step": 20883 + }, + { + "epoch": 2.656659458084213, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.931962013244629, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8553720712661743, + "num_tokens": 796938703.0, + "step": 20884 + }, + { + "epoch": 2.6567866683628036, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.905795693397522, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8515595197677612, + "num_tokens": 796982287.0, + "step": 20885 + }, + { + "epoch": 2.656913878641394, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.33701491355896, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.86760413646698, + "num_tokens": 797017640.0, + "step": 20886 + }, + { + "epoch": 2.6570410889199847, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.004408836364746, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.849958062171936, + "num_tokens": 797054050.0, + "step": 20887 + }, + { + "epoch": 2.6571682991985752, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7208325862884521, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8700804114341736, + "num_tokens": 797092429.0, + "step": 20888 + }, + { + "epoch": 2.6572955094771658, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0094947814941406, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8623427152633667, + "num_tokens": 797125646.0, + "step": 20889 + }, + { + "epoch": 2.6574227197557563, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7682342529296875, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8697561621665955, + "num_tokens": 797164560.0, + "step": 20890 + }, + { + "epoch": 2.657549930034347, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2403619289398193, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.863863468170166, + "num_tokens": 797194168.0, + "step": 20891 + }, + { + "epoch": 2.6576771403129373, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9637064933776855, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8857425451278687, + "num_tokens": 797227045.0, + "step": 20892 + }, + { + "epoch": 2.657804350591528, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7654553651809692, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8807259798049927, + "num_tokens": 797267607.0, + "step": 20893 + }, + { + "epoch": 2.6579315608701184, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8198657035827637, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8562333583831787, + "num_tokens": 797309320.0, + "step": 20894 + }, + { + "epoch": 2.658058771148709, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9779423475265503, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8626519441604614, + "num_tokens": 797348258.0, + "step": 20895 + }, + { + "epoch": 2.6581859814272994, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.937678337097168, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8654429912567139, + "num_tokens": 797391563.0, + "step": 20896 + }, + { + "epoch": 2.65831319170589, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.801523208618164, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8753156661987305, + "num_tokens": 797428314.0, + "step": 20897 + }, + { + "epoch": 2.6584404019844805, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0006232261657715, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8441494703292847, + "num_tokens": 797464480.0, + "step": 20898 + }, + { + "epoch": 2.658567612263071, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8208889961242676, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8659886121749878, + "num_tokens": 797504135.0, + "step": 20899 + }, + { + "epoch": 2.658694822541661, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8814489841461182, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8676296472549438, + "num_tokens": 797544222.0, + "step": 20900 + }, + { + "epoch": 2.658822032820252, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8731106519699097, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8704875111579895, + "num_tokens": 797583695.0, + "step": 20901 + }, + { + "epoch": 2.658949243098842, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9433202743530273, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8505485653877258, + "num_tokens": 797625660.0, + "step": 20902 + }, + { + "epoch": 2.659076453377433, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8008157014846802, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8642352819442749, + "num_tokens": 797665807.0, + "step": 20903 + }, + { + "epoch": 2.6592036636560232, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.780193567276001, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8468919992446899, + "num_tokens": 797707935.0, + "step": 20904 + }, + { + "epoch": 2.6593308739346138, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0606534481048584, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.858401358127594, + "num_tokens": 797745502.0, + "step": 20905 + }, + { + "epoch": 2.6594580842132043, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.151620864868164, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8636258244514465, + "num_tokens": 797779412.0, + "step": 20906 + }, + { + "epoch": 2.659585294491795, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3086960315704346, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8692362308502197, + "num_tokens": 797816677.0, + "step": 20907 + }, + { + "epoch": 2.6597125047703853, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0546278953552246, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8469672203063965, + "num_tokens": 797854260.0, + "step": 20908 + }, + { + "epoch": 2.659839715048976, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.038243532180786, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8535023927688599, + "num_tokens": 797892349.0, + "step": 20909 + }, + { + "epoch": 2.6599669253275664, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9412389993667603, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8675988912582397, + "num_tokens": 797928028.0, + "step": 20910 + }, + { + "epoch": 2.660094135606157, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.251450777053833, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8504603505134583, + "num_tokens": 797963885.0, + "step": 20911 + }, + { + "epoch": 2.6602213458847475, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9144316911697388, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8507146835327148, + "num_tokens": 798009842.0, + "step": 20912 + }, + { + "epoch": 2.660348556163338, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9897232055664062, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8767059445381165, + "num_tokens": 798043207.0, + "step": 20913 + }, + { + "epoch": 2.6604757664419285, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.210721492767334, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8797158598899841, + "num_tokens": 798080887.0, + "step": 20914 + }, + { + "epoch": 2.660602976720519, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8126798868179321, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8735572695732117, + "num_tokens": 798126393.0, + "step": 20915 + }, + { + "epoch": 2.6607301869991096, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7962212562561035, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8813263177871704, + "num_tokens": 798166099.0, + "step": 20916 + }, + { + "epoch": 2.6608573972777, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0050666332244873, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8596366047859192, + "num_tokens": 798202285.0, + "step": 20917 + }, + { + "epoch": 2.6609846075562906, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9048393964767456, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8729966878890991, + "num_tokens": 798243463.0, + "step": 20918 + }, + { + "epoch": 2.661111817834881, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.922858476638794, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.867289125919342, + "num_tokens": 798275507.0, + "step": 20919 + }, + { + "epoch": 2.6612390281134717, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9810253381729126, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8847205638885498, + "num_tokens": 798310536.0, + "step": 20920 + }, + { + "epoch": 2.661366238392062, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.200679063796997, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8604695796966553, + "num_tokens": 798347874.0, + "step": 20921 + }, + { + "epoch": 2.6614934486706527, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.950322151184082, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8891459703445435, + "num_tokens": 798381865.0, + "step": 20922 + }, + { + "epoch": 2.661620658949243, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.81760573387146, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8632757067680359, + "num_tokens": 798423378.0, + "step": 20923 + }, + { + "epoch": 2.661747869227834, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.116344451904297, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8720494508743286, + "num_tokens": 798460278.0, + "step": 20924 + }, + { + "epoch": 2.661875079506424, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.872401475906372, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8672108054161072, + "num_tokens": 798496622.0, + "step": 20925 + }, + { + "epoch": 2.662002289785015, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8491480350494385, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8603085279464722, + "num_tokens": 798533061.0, + "step": 20926 + }, + { + "epoch": 2.662129500063605, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0534698963165283, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8578759431838989, + "num_tokens": 798573689.0, + "step": 20927 + }, + { + "epoch": 2.662256710342196, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.907976746559143, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8738493323326111, + "num_tokens": 798614058.0, + "step": 20928 + }, + { + "epoch": 2.662383920620786, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9214378595352173, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8555567264556885, + "num_tokens": 798657769.0, + "step": 20929 + }, + { + "epoch": 2.6625111308993765, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7652004957199097, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8687484264373779, + "num_tokens": 798701084.0, + "step": 20930 + }, + { + "epoch": 2.662638341177967, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8450349569320679, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8717737197875977, + "num_tokens": 798738029.0, + "step": 20931 + }, + { + "epoch": 2.6627655514565576, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.951678991317749, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8609331846237183, + "num_tokens": 798773537.0, + "step": 20932 + }, + { + "epoch": 2.662892761735148, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9647369384765625, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8630051016807556, + "num_tokens": 798810397.0, + "step": 20933 + }, + { + "epoch": 2.6630199720137386, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.89988374710083, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8612631559371948, + "num_tokens": 798850796.0, + "step": 20934 + }, + { + "epoch": 2.663147182292329, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8289837837219238, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.865912914276123, + "num_tokens": 798892379.0, + "step": 20935 + }, + { + "epoch": 2.6632743925709197, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9347259998321533, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8527282476425171, + "num_tokens": 798929444.0, + "step": 20936 + }, + { + "epoch": 2.66340160284951, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.007950782775879, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8704342246055603, + "num_tokens": 798964086.0, + "step": 20937 + }, + { + "epoch": 2.6635288131281007, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8595153093338013, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8726812601089478, + "num_tokens": 799007408.0, + "step": 20938 + }, + { + "epoch": 2.6636560234066913, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9070210456848145, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.870049238204956, + "num_tokens": 799045930.0, + "step": 20939 + }, + { + "epoch": 2.663783233685282, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0163111686706543, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.877096951007843, + "num_tokens": 799083568.0, + "step": 20940 + }, + { + "epoch": 2.6639104439638723, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1073062419891357, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8619746565818787, + "num_tokens": 799123681.0, + "step": 20941 + }, + { + "epoch": 2.664037654242463, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9902279376983643, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8615278005599976, + "num_tokens": 799164623.0, + "step": 20942 + }, + { + "epoch": 2.6641648645210534, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9801558256149292, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8650661706924438, + "num_tokens": 799200408.0, + "step": 20943 + }, + { + "epoch": 2.664292074799644, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8886431455612183, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8553104400634766, + "num_tokens": 799245106.0, + "step": 20944 + }, + { + "epoch": 2.6644192850782344, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 3.062972068786621, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8733828067779541, + "num_tokens": 799290429.0, + "step": 20945 + }, + { + "epoch": 2.664546495356825, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.114546060562134, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8639020323753357, + "num_tokens": 799322951.0, + "step": 20946 + }, + { + "epoch": 2.6646737056354155, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9542406797409058, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.86273592710495, + "num_tokens": 799365498.0, + "step": 20947 + }, + { + "epoch": 2.6648009159140056, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.80457603931427, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8605765104293823, + "num_tokens": 799405894.0, + "step": 20948 + }, + { + "epoch": 2.6649281261925966, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.04463791847229, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8635531067848206, + "num_tokens": 799439750.0, + "step": 20949 + }, + { + "epoch": 2.6650553364711866, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0255844593048096, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.856970489025116, + "num_tokens": 799475170.0, + "step": 20950 + }, + { + "epoch": 2.6651825467497776, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0542147159576416, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8517943620681763, + "num_tokens": 799514231.0, + "step": 20951 + }, + { + "epoch": 2.6653097570283677, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.908323049545288, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8655800819396973, + "num_tokens": 799549512.0, + "step": 20952 + }, + { + "epoch": 2.6654369673069587, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9521113634109497, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8603314757347107, + "num_tokens": 799592144.0, + "step": 20953 + }, + { + "epoch": 2.6655641775855488, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 3.027852773666382, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8510571718215942, + "num_tokens": 799634779.0, + "step": 20954 + }, + { + "epoch": 2.6656913878641393, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.8988165855407715, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8773622512817383, + "num_tokens": 799677548.0, + "step": 20955 + }, + { + "epoch": 2.66581859814273, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.949971318244934, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8480375409126282, + "num_tokens": 799715679.0, + "step": 20956 + }, + { + "epoch": 2.6659458084213203, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9024498462677002, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.855991005897522, + "num_tokens": 799755701.0, + "step": 20957 + }, + { + "epoch": 2.666073018699911, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8594589233398438, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8595669865608215, + "num_tokens": 799795151.0, + "step": 20958 + }, + { + "epoch": 2.6662002289785014, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8623164892196655, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8715968132019043, + "num_tokens": 799832249.0, + "step": 20959 + }, + { + "epoch": 2.666327439257092, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9166370630264282, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8435530662536621, + "num_tokens": 799867762.0, + "step": 20960 + }, + { + "epoch": 2.6664546495356825, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8382799625396729, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8622766733169556, + "num_tokens": 799909012.0, + "step": 20961 + }, + { + "epoch": 2.666581859814273, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7580299377441406, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8610693216323853, + "num_tokens": 799950316.0, + "step": 20962 + }, + { + "epoch": 2.6667090700928635, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8284101486206055, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8797988891601562, + "num_tokens": 799990037.0, + "step": 20963 + }, + { + "epoch": 2.666836280371454, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9324922561645508, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.874980092048645, + "num_tokens": 800023935.0, + "step": 20964 + }, + { + "epoch": 2.6669634906500446, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.791208028793335, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8725184202194214, + "num_tokens": 800066357.0, + "step": 20965 + }, + { + "epoch": 2.667090700928635, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2984366416931152, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8507215976715088, + "num_tokens": 800100667.0, + "step": 20966 + }, + { + "epoch": 2.6672179112072256, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9410629272460938, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8691336512565613, + "num_tokens": 800135101.0, + "step": 20967 + }, + { + "epoch": 2.667345121485816, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8911606073379517, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8805979490280151, + "num_tokens": 800176308.0, + "step": 20968 + }, + { + "epoch": 2.6674723317644067, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3607940673828125, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8758903741836548, + "num_tokens": 800205585.0, + "step": 20969 + }, + { + "epoch": 2.667599542042997, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9115078449249268, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8648074865341187, + "num_tokens": 800241400.0, + "step": 20970 + }, + { + "epoch": 2.6677267523215877, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0025746822357178, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8734155893325806, + "num_tokens": 800279060.0, + "step": 20971 + }, + { + "epoch": 2.6678539626001783, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.839099645614624, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.874091625213623, + "num_tokens": 800318606.0, + "step": 20972 + }, + { + "epoch": 2.6679811728787683, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9401638507843018, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8649700880050659, + "num_tokens": 800356795.0, + "step": 20973 + }, + { + "epoch": 2.6681083831573593, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0712971687316895, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8741887807846069, + "num_tokens": 800393786.0, + "step": 20974 + }, + { + "epoch": 2.6682355934359494, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8571168184280396, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.862401008605957, + "num_tokens": 800440153.0, + "step": 20975 + }, + { + "epoch": 2.6683628037145404, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.96683931350708, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8472487926483154, + "num_tokens": 800476343.0, + "step": 20976 + }, + { + "epoch": 2.6684900139931305, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7615230083465576, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8784818649291992, + "num_tokens": 800514227.0, + "step": 20977 + }, + { + "epoch": 2.668617224271721, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9589489698410034, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8642088174819946, + "num_tokens": 800552357.0, + "step": 20978 + }, + { + "epoch": 2.6687444345503115, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9146369695663452, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8741754293441772, + "num_tokens": 800587624.0, + "step": 20979 + }, + { + "epoch": 2.668871644828902, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8924998044967651, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8589308261871338, + "num_tokens": 800631183.0, + "step": 20980 + }, + { + "epoch": 2.6689988551074926, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8466466665267944, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8685644865036011, + "num_tokens": 800670334.0, + "step": 20981 + }, + { + "epoch": 2.669126065386083, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0158283710479736, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8621623516082764, + "num_tokens": 800703500.0, + "step": 20982 + }, + { + "epoch": 2.6692532756646736, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8865140676498413, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8583844304084778, + "num_tokens": 800742904.0, + "step": 20983 + }, + { + "epoch": 2.669380485943264, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8823667764663696, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8626648187637329, + "num_tokens": 800777556.0, + "step": 20984 + }, + { + "epoch": 2.6695076962218547, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9189716577529907, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8569067716598511, + "num_tokens": 800817532.0, + "step": 20985 + }, + { + "epoch": 2.669634906500445, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7732033729553223, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8750541806221008, + "num_tokens": 800856567.0, + "step": 20986 + }, + { + "epoch": 2.6697621167790357, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8991934061050415, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8431084156036377, + "num_tokens": 800901234.0, + "step": 20987 + }, + { + "epoch": 2.6698893270576263, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0930166244506836, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8625984787940979, + "num_tokens": 800942647.0, + "step": 20988 + }, + { + "epoch": 2.670016537336217, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.833649754524231, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8716976046562195, + "num_tokens": 800980194.0, + "step": 20989 + }, + { + "epoch": 2.6701437476148073, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0106542110443115, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8698177337646484, + "num_tokens": 801016978.0, + "step": 20990 + }, + { + "epoch": 2.670270957893398, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0571329593658447, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8513492345809937, + "num_tokens": 801051921.0, + "step": 20991 + }, + { + "epoch": 2.6703981681719884, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0916526317596436, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8397482633590698, + "num_tokens": 801087189.0, + "step": 20992 + }, + { + "epoch": 2.670525378450579, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9565486907958984, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8713018298149109, + "num_tokens": 801121574.0, + "step": 20993 + }, + { + "epoch": 2.6706525887291694, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7955985069274902, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8706305027008057, + "num_tokens": 801160497.0, + "step": 20994 + }, + { + "epoch": 2.67077979900776, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8703309297561646, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8676149249076843, + "num_tokens": 801198556.0, + "step": 20995 + }, + { + "epoch": 2.6709070092863505, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0631051063537598, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8918077945709229, + "num_tokens": 801230616.0, + "step": 20996 + }, + { + "epoch": 2.671034219564941, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9824718236923218, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8631873726844788, + "num_tokens": 801266582.0, + "step": 20997 + }, + { + "epoch": 2.671161429843531, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8607959747314453, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8766582608222961, + "num_tokens": 801305127.0, + "step": 20998 + }, + { + "epoch": 2.671288640122122, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0197622776031494, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8703283071517944, + "num_tokens": 801348284.0, + "step": 20999 + }, + { + "epoch": 2.671415850400712, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7512478828430176, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8864243030548096, + "num_tokens": 801385298.0, + "step": 21000 + }, + { + "epoch": 2.671543060679303, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0900585651397705, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8577529191970825, + "num_tokens": 801417020.0, + "step": 21001 + }, + { + "epoch": 2.6716702709578932, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.851250171661377, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.859688401222229, + "num_tokens": 801455328.0, + "step": 21002 + }, + { + "epoch": 2.6717974812364838, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.763929009437561, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8755836486816406, + "num_tokens": 801494342.0, + "step": 21003 + }, + { + "epoch": 2.6719246915150743, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9335211515426636, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8686723709106445, + "num_tokens": 801528182.0, + "step": 21004 + }, + { + "epoch": 2.672051901793665, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.797913670539856, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8642603158950806, + "num_tokens": 801569424.0, + "step": 21005 + }, + { + "epoch": 2.6721791120722553, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9606118202209473, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8645471334457397, + "num_tokens": 801608809.0, + "step": 21006 + }, + { + "epoch": 2.672306322350846, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.848418951034546, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8563631176948547, + "num_tokens": 801651910.0, + "step": 21007 + }, + { + "epoch": 2.6724335326294364, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8064838647842407, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8656842708587646, + "num_tokens": 801692961.0, + "step": 21008 + }, + { + "epoch": 2.672560742908027, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.014106273651123, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8524508476257324, + "num_tokens": 801731278.0, + "step": 21009 + }, + { + "epoch": 2.6726879531866174, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8432308435440063, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8615871667861938, + "num_tokens": 801769562.0, + "step": 21010 + }, + { + "epoch": 2.672815163465208, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8719946146011353, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8625534772872925, + "num_tokens": 801811046.0, + "step": 21011 + }, + { + "epoch": 2.6729423737437985, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.02770733833313, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.842366099357605, + "num_tokens": 801847833.0, + "step": 21012 + }, + { + "epoch": 2.673069584022389, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8946903944015503, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8601794242858887, + "num_tokens": 801887819.0, + "step": 21013 + }, + { + "epoch": 2.6731967943009796, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9055284261703491, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8777408599853516, + "num_tokens": 801922535.0, + "step": 21014 + }, + { + "epoch": 2.67332400457957, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8423445224761963, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.870115339756012, + "num_tokens": 801958025.0, + "step": 21015 + }, + { + "epoch": 2.6734512148581606, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.83891761302948, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8686894178390503, + "num_tokens": 801991291.0, + "step": 21016 + }, + { + "epoch": 2.673578425136751, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9342575073242188, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8674171566963196, + "num_tokens": 802024952.0, + "step": 21017 + }, + { + "epoch": 2.6737056354153417, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8394219875335693, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8639253377914429, + "num_tokens": 802065741.0, + "step": 21018 + }, + { + "epoch": 2.673832845693932, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9972678422927856, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8637862801551819, + "num_tokens": 802107153.0, + "step": 21019 + }, + { + "epoch": 2.6739600559725227, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7726233005523682, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8694756031036377, + "num_tokens": 802148177.0, + "step": 21020 + }, + { + "epoch": 2.674087266251113, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8261475563049316, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8662098050117493, + "num_tokens": 802186144.0, + "step": 21021 + }, + { + "epoch": 2.674214476529704, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8603898286819458, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8800128698348999, + "num_tokens": 802224054.0, + "step": 21022 + }, + { + "epoch": 2.674341686808294, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0250208377838135, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8488144278526306, + "num_tokens": 802264483.0, + "step": 21023 + }, + { + "epoch": 2.674468897086885, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1846377849578857, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.872175395488739, + "num_tokens": 802302933.0, + "step": 21024 + }, + { + "epoch": 2.674596107365475, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8162587881088257, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8587108850479126, + "num_tokens": 802347354.0, + "step": 21025 + }, + { + "epoch": 2.674723317644066, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8311620950698853, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8748733401298523, + "num_tokens": 802392159.0, + "step": 21026 + }, + { + "epoch": 2.674850527922656, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7742338180541992, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8678609728813171, + "num_tokens": 802432629.0, + "step": 21027 + }, + { + "epoch": 2.6749777382012465, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7127858400344849, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.874687910079956, + "num_tokens": 802471776.0, + "step": 21028 + }, + { + "epoch": 2.675104948479837, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9995594024658203, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8580291271209717, + "num_tokens": 802507314.0, + "step": 21029 + }, + { + "epoch": 2.6752321587584276, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7742340564727783, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8745402693748474, + "num_tokens": 802550587.0, + "step": 21030 + }, + { + "epoch": 2.675359369037018, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8288731575012207, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8650660514831543, + "num_tokens": 802591544.0, + "step": 21031 + }, + { + "epoch": 2.6754865793156086, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8070406913757324, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8739933371543884, + "num_tokens": 802627336.0, + "step": 21032 + }, + { + "epoch": 2.675613789594199, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.805495023727417, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8657158613204956, + "num_tokens": 802668358.0, + "step": 21033 + }, + { + "epoch": 2.6757409998727897, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7678941488265991, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8726130723953247, + "num_tokens": 802708023.0, + "step": 21034 + }, + { + "epoch": 2.67586821015138, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8585689067840576, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8877689838409424, + "num_tokens": 802745856.0, + "step": 21035 + }, + { + "epoch": 2.6759954204299707, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.891858696937561, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8765774369239807, + "num_tokens": 802779032.0, + "step": 21036 + }, + { + "epoch": 2.6761226307085613, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.191735029220581, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8654471635818481, + "num_tokens": 802815736.0, + "step": 21037 + }, + { + "epoch": 2.676249840987152, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9329510927200317, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8601720929145813, + "num_tokens": 802854133.0, + "step": 21038 + }, + { + "epoch": 2.6763770512657423, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7699170112609863, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8724871873855591, + "num_tokens": 802892969.0, + "step": 21039 + }, + { + "epoch": 2.676504261544333, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.823843002319336, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8816684484481812, + "num_tokens": 802936998.0, + "step": 21040 + }, + { + "epoch": 2.6766314718229234, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.776969313621521, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8734984397888184, + "num_tokens": 802979040.0, + "step": 21041 + }, + { + "epoch": 2.676758682101514, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8552582263946533, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8764335513114929, + "num_tokens": 803016516.0, + "step": 21042 + }, + { + "epoch": 2.6768858923801044, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8719556331634521, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8637136816978455, + "num_tokens": 803055566.0, + "step": 21043 + }, + { + "epoch": 2.677013102658695, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7634893655776978, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8687068223953247, + "num_tokens": 803093683.0, + "step": 21044 + }, + { + "epoch": 2.6771403129372855, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.719564437866211, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8744025230407715, + "num_tokens": 803133691.0, + "step": 21045 + }, + { + "epoch": 2.6772675232158756, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0259766578674316, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8757036924362183, + "num_tokens": 803169870.0, + "step": 21046 + }, + { + "epoch": 2.6773947334944665, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0519633293151855, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8581047654151917, + "num_tokens": 803209038.0, + "step": 21047 + }, + { + "epoch": 2.6775219437730566, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1123909950256348, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8515917062759399, + "num_tokens": 803239725.0, + "step": 21048 + }, + { + "epoch": 2.6776491540516476, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0094199180603027, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8665750026702881, + "num_tokens": 803273580.0, + "step": 21049 + }, + { + "epoch": 2.6777763643302377, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7922991514205933, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8721644878387451, + "num_tokens": 803313583.0, + "step": 21050 + }, + { + "epoch": 2.6779035746088287, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9223283529281616, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8733144402503967, + "num_tokens": 803347517.0, + "step": 21051 + }, + { + "epoch": 2.6780307848874187, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.930423378944397, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8531632423400879, + "num_tokens": 803388183.0, + "step": 21052 + }, + { + "epoch": 2.6781579951660093, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9453239440917969, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8674119710922241, + "num_tokens": 803428377.0, + "step": 21053 + }, + { + "epoch": 2.6782852054446, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.894277572631836, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.876262366771698, + "num_tokens": 803460930.0, + "step": 21054 + }, + { + "epoch": 2.6784124157231903, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8707455396652222, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8751461505889893, + "num_tokens": 803497971.0, + "step": 21055 + }, + { + "epoch": 2.678539626001781, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.862976312637329, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8814394474029541, + "num_tokens": 803535514.0, + "step": 21056 + }, + { + "epoch": 2.6786668362803714, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.021972417831421, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8610073328018188, + "num_tokens": 803580767.0, + "step": 21057 + }, + { + "epoch": 2.678794046558962, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.055485486984253, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.868381679058075, + "num_tokens": 803619168.0, + "step": 21058 + }, + { + "epoch": 2.6789212568375524, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.278376579284668, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8515461087226868, + "num_tokens": 803654785.0, + "step": 21059 + }, + { + "epoch": 2.679048467116143, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7817227840423584, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8747356534004211, + "num_tokens": 803692505.0, + "step": 21060 + }, + { + "epoch": 2.6791756773947335, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7856574058532715, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8735679388046265, + "num_tokens": 803732592.0, + "step": 21061 + }, + { + "epoch": 2.679302887673324, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9423261880874634, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8547298312187195, + "num_tokens": 803767601.0, + "step": 21062 + }, + { + "epoch": 2.6794300979519146, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1281869411468506, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8582033514976501, + "num_tokens": 803804329.0, + "step": 21063 + }, + { + "epoch": 2.679557308230505, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7660667896270752, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8618952035903931, + "num_tokens": 803846182.0, + "step": 21064 + }, + { + "epoch": 2.6796845185090956, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.08905291557312, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8615790605545044, + "num_tokens": 803883561.0, + "step": 21065 + }, + { + "epoch": 2.679811728787686, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0467369556427, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8854836225509644, + "num_tokens": 803916860.0, + "step": 21066 + }, + { + "epoch": 2.6799389390662767, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.060805320739746, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.866015613079071, + "num_tokens": 803951949.0, + "step": 21067 + }, + { + "epoch": 2.680066149344867, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9771764278411865, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.86065673828125, + "num_tokens": 803992094.0, + "step": 21068 + }, + { + "epoch": 2.6801933596234577, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.328270196914673, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8645520806312561, + "num_tokens": 804034058.0, + "step": 21069 + }, + { + "epoch": 2.6803205699020483, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9049627780914307, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8580758571624756, + "num_tokens": 804072722.0, + "step": 21070 + }, + { + "epoch": 2.6804477801806383, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6730526685714722, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8726806044578552, + "num_tokens": 804114253.0, + "step": 21071 + }, + { + "epoch": 2.6805749904592293, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7842531204223633, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8683679103851318, + "num_tokens": 804154877.0, + "step": 21072 + }, + { + "epoch": 2.6807022007378194, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2162768840789795, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8613607883453369, + "num_tokens": 804191060.0, + "step": 21073 + }, + { + "epoch": 2.6808294110164104, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9553744792938232, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.857246994972229, + "num_tokens": 804231574.0, + "step": 21074 + }, + { + "epoch": 2.6809566212950005, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1601898670196533, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8453770875930786, + "num_tokens": 804264027.0, + "step": 21075 + }, + { + "epoch": 2.681083831573591, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1086530685424805, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8676602840423584, + "num_tokens": 804302906.0, + "step": 21076 + }, + { + "epoch": 2.6812110418521815, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.155545949935913, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8605188131332397, + "num_tokens": 804339107.0, + "step": 21077 + }, + { + "epoch": 2.681338252130772, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.5441665649414062, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8702210187911987, + "num_tokens": 804374292.0, + "step": 21078 + }, + { + "epoch": 2.6814654624093626, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.85940682888031, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8688746690750122, + "num_tokens": 804420287.0, + "step": 21079 + }, + { + "epoch": 2.681592672687953, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9914685487747192, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8713330030441284, + "num_tokens": 804453977.0, + "step": 21080 + }, + { + "epoch": 2.6817198829665436, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9462083578109741, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8501622676849365, + "num_tokens": 804494958.0, + "step": 21081 + }, + { + "epoch": 2.681847093245134, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0703511238098145, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8461819887161255, + "num_tokens": 804532209.0, + "step": 21082 + }, + { + "epoch": 2.6819743035237247, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9340462684631348, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8681349754333496, + "num_tokens": 804572937.0, + "step": 21083 + }, + { + "epoch": 2.682101513802315, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9753528833389282, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8655380010604858, + "num_tokens": 804610263.0, + "step": 21084 + }, + { + "epoch": 2.6822287240809057, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9510958194732666, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8698906302452087, + "num_tokens": 804645516.0, + "step": 21085 + }, + { + "epoch": 2.6823559343594963, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7954882383346558, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8777902722358704, + "num_tokens": 804684540.0, + "step": 21086 + }, + { + "epoch": 2.682483144638087, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8500858545303345, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8683508038520813, + "num_tokens": 804723030.0, + "step": 21087 + }, + { + "epoch": 2.6826103549166773, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.853131890296936, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8744765520095825, + "num_tokens": 804759202.0, + "step": 21088 + }, + { + "epoch": 2.682737565195268, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.844245433807373, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8626667261123657, + "num_tokens": 804803199.0, + "step": 21089 + }, + { + "epoch": 2.6828647754738584, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.982229471206665, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8635183572769165, + "num_tokens": 804835675.0, + "step": 21090 + }, + { + "epoch": 2.682991985752449, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9073477983474731, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8526049256324768, + "num_tokens": 804876583.0, + "step": 21091 + }, + { + "epoch": 2.6831191960310394, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.260871171951294, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8554285168647766, + "num_tokens": 804906110.0, + "step": 21092 + }, + { + "epoch": 2.68324640630963, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.930640459060669, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8570072650909424, + "num_tokens": 804942196.0, + "step": 21093 + }, + { + "epoch": 2.6833736165882205, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8719595670700073, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8641072511672974, + "num_tokens": 804978760.0, + "step": 21094 + }, + { + "epoch": 2.683500826866811, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9918243885040283, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8578446507453918, + "num_tokens": 805014212.0, + "step": 21095 + }, + { + "epoch": 2.683628037145401, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.807674765586853, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8897005915641785, + "num_tokens": 805050528.0, + "step": 21096 + }, + { + "epoch": 2.683755247423992, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1992361545562744, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8690939545631409, + "num_tokens": 805087355.0, + "step": 21097 + }, + { + "epoch": 2.683882457702582, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0259640216827393, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8517078757286072, + "num_tokens": 805117837.0, + "step": 21098 + }, + { + "epoch": 2.684009667981173, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8551709651947021, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8649532794952393, + "num_tokens": 805157734.0, + "step": 21099 + }, + { + "epoch": 2.684136878259763, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.974034070968628, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8724154233932495, + "num_tokens": 805190657.0, + "step": 21100 + }, + { + "epoch": 2.6842640885383537, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9175878763198853, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8579424619674683, + "num_tokens": 805229654.0, + "step": 21101 + }, + { + "epoch": 2.6843912988169443, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8066781759262085, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8657040596008301, + "num_tokens": 805269199.0, + "step": 21102 + }, + { + "epoch": 2.684518509095535, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9445444345474243, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8604216575622559, + "num_tokens": 805308918.0, + "step": 21103 + }, + { + "epoch": 2.6846457193741253, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0572617053985596, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8617801070213318, + "num_tokens": 805345014.0, + "step": 21104 + }, + { + "epoch": 2.684772929652716, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7913925647735596, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8787447214126587, + "num_tokens": 805383674.0, + "step": 21105 + }, + { + "epoch": 2.6849001399313064, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9030771255493164, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8690521717071533, + "num_tokens": 805419765.0, + "step": 21106 + }, + { + "epoch": 2.685027350209897, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8906502723693848, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8660884499549866, + "num_tokens": 805461346.0, + "step": 21107 + }, + { + "epoch": 2.6851545604884874, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.025049924850464, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8724389672279358, + "num_tokens": 805494519.0, + "step": 21108 + }, + { + "epoch": 2.685281770767078, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9658079147338867, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8547133803367615, + "num_tokens": 805530542.0, + "step": 21109 + }, + { + "epoch": 2.6854089810456685, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9937506914138794, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8625208139419556, + "num_tokens": 805564531.0, + "step": 21110 + }, + { + "epoch": 2.685536191324259, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8770521879196167, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8612414598464966, + "num_tokens": 805606011.0, + "step": 21111 + }, + { + "epoch": 2.6856634016028496, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.199467658996582, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.860862135887146, + "num_tokens": 805643078.0, + "step": 21112 + }, + { + "epoch": 2.68579061188144, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.028881072998047, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8709591031074524, + "num_tokens": 805678554.0, + "step": 21113 + }, + { + "epoch": 2.6859178221600306, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.921125888824463, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8764084577560425, + "num_tokens": 805713736.0, + "step": 21114 + }, + { + "epoch": 2.686045032438621, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8531640768051147, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8599468469619751, + "num_tokens": 805751798.0, + "step": 21115 + }, + { + "epoch": 2.6861722427172117, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.831335186958313, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8733429908752441, + "num_tokens": 805789205.0, + "step": 21116 + }, + { + "epoch": 2.686299452995802, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8195703029632568, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8693438172340393, + "num_tokens": 805824910.0, + "step": 21117 + }, + { + "epoch": 2.6864266632743927, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9932724237442017, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8613905906677246, + "num_tokens": 805861335.0, + "step": 21118 + }, + { + "epoch": 2.686553873552983, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8126635551452637, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8613324165344238, + "num_tokens": 805901488.0, + "step": 21119 + }, + { + "epoch": 2.686681083831574, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0161290168762207, + "learning_rate": 1e-06, + "loss": 0.5477, + "mean_token_accuracy": 0.827946126461029, + "num_tokens": 805941336.0, + "step": 21120 + }, + { + "epoch": 2.686808294110164, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9808120727539062, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8733462691307068, + "num_tokens": 805974222.0, + "step": 21121 + }, + { + "epoch": 2.686935504388755, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8340308666229248, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8613150119781494, + "num_tokens": 806014942.0, + "step": 21122 + }, + { + "epoch": 2.687062714667345, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8920555114746094, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8771876692771912, + "num_tokens": 806048894.0, + "step": 21123 + }, + { + "epoch": 2.687189924945936, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.85404372215271, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8638193607330322, + "num_tokens": 806089714.0, + "step": 21124 + }, + { + "epoch": 2.687317135224526, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8132658004760742, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.853545069694519, + "num_tokens": 806128974.0, + "step": 21125 + }, + { + "epoch": 2.6874443455031165, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8508340120315552, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8633660078048706, + "num_tokens": 806169857.0, + "step": 21126 + }, + { + "epoch": 2.687571555781707, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2515506744384766, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8676062822341919, + "num_tokens": 806202826.0, + "step": 21127 + }, + { + "epoch": 2.6876987660602976, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9259538650512695, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8632741570472717, + "num_tokens": 806242850.0, + "step": 21128 + }, + { + "epoch": 2.687825976338888, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9669708013534546, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8680933117866516, + "num_tokens": 806279024.0, + "step": 21129 + }, + { + "epoch": 2.6879531866174786, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8836500644683838, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8496845364570618, + "num_tokens": 806313669.0, + "step": 21130 + }, + { + "epoch": 2.688080396896069, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8246209621429443, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8686299324035645, + "num_tokens": 806353128.0, + "step": 21131 + }, + { + "epoch": 2.6882076071746597, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0827417373657227, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8661941289901733, + "num_tokens": 806385044.0, + "step": 21132 + }, + { + "epoch": 2.68833481745325, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9139236211776733, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8573821187019348, + "num_tokens": 806422157.0, + "step": 21133 + }, + { + "epoch": 2.6884620277318407, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.761351466178894, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8649033308029175, + "num_tokens": 806463180.0, + "step": 21134 + }, + { + "epoch": 2.6885892380104313, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.953939437866211, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8616851568222046, + "num_tokens": 806502031.0, + "step": 21135 + }, + { + "epoch": 2.688716448289022, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8608264923095703, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8683511018753052, + "num_tokens": 806539585.0, + "step": 21136 + }, + { + "epoch": 2.6888436585676123, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8470702171325684, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8841478228569031, + "num_tokens": 806573147.0, + "step": 21137 + }, + { + "epoch": 2.688970868846203, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.880623459815979, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8782302141189575, + "num_tokens": 806609785.0, + "step": 21138 + }, + { + "epoch": 2.6890980791247934, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9013344049453735, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8718019723892212, + "num_tokens": 806647709.0, + "step": 21139 + }, + { + "epoch": 2.689225289403384, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9035520553588867, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8702125549316406, + "num_tokens": 806684448.0, + "step": 21140 + }, + { + "epoch": 2.6893524996819744, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8769456148147583, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8694446086883545, + "num_tokens": 806725326.0, + "step": 21141 + }, + { + "epoch": 2.689479709960565, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1066060066223145, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8652936816215515, + "num_tokens": 806759797.0, + "step": 21142 + }, + { + "epoch": 2.6896069202391555, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2209646701812744, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8726105690002441, + "num_tokens": 806799986.0, + "step": 21143 + }, + { + "epoch": 2.6897341305177456, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9028663635253906, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8617130517959595, + "num_tokens": 806843055.0, + "step": 21144 + }, + { + "epoch": 2.6898613407963365, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8914732933044434, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8795393705368042, + "num_tokens": 806875897.0, + "step": 21145 + }, + { + "epoch": 2.6899885510749266, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9565917253494263, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8731181621551514, + "num_tokens": 806915583.0, + "step": 21146 + }, + { + "epoch": 2.6901157613535176, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9506802558898926, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8696969151496887, + "num_tokens": 806953851.0, + "step": 21147 + }, + { + "epoch": 2.6902429716321077, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0444607734680176, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8613063097000122, + "num_tokens": 806988519.0, + "step": 21148 + }, + { + "epoch": 2.6903701819106987, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8173786401748657, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.868770956993103, + "num_tokens": 807029612.0, + "step": 21149 + }, + { + "epoch": 2.6904973921892887, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.073629379272461, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8504986763000488, + "num_tokens": 807064009.0, + "step": 21150 + }, + { + "epoch": 2.6906246024678793, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.830246925354004, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8634541630744934, + "num_tokens": 807104694.0, + "step": 21151 + }, + { + "epoch": 2.69075181274647, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9829992055892944, + "learning_rate": 1e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8426871299743652, + "num_tokens": 807142500.0, + "step": 21152 + }, + { + "epoch": 2.6908790230250603, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0572335720062256, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8622036576271057, + "num_tokens": 807172123.0, + "step": 21153 + }, + { + "epoch": 2.691006233303651, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.009817600250244, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8681030869483948, + "num_tokens": 807206422.0, + "step": 21154 + }, + { + "epoch": 2.6911334435822414, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0157721042633057, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8507453203201294, + "num_tokens": 807240867.0, + "step": 21155 + }, + { + "epoch": 2.691260653860832, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8106415271759033, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8752217292785645, + "num_tokens": 807275838.0, + "step": 21156 + }, + { + "epoch": 2.6913878641394224, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.788609504699707, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8723950386047363, + "num_tokens": 807315164.0, + "step": 21157 + }, + { + "epoch": 2.691515074418013, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.837084174156189, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8573662042617798, + "num_tokens": 807358016.0, + "step": 21158 + }, + { + "epoch": 2.6916422846966035, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8678828477859497, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.864355742931366, + "num_tokens": 807402751.0, + "step": 21159 + }, + { + "epoch": 2.691769494975194, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8768616914749146, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8693773746490479, + "num_tokens": 807442761.0, + "step": 21160 + }, + { + "epoch": 2.6918967052537845, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9280422925949097, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.872685432434082, + "num_tokens": 807478550.0, + "step": 21161 + }, + { + "epoch": 2.692023915532375, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.739917278289795, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8737539649009705, + "num_tokens": 807522756.0, + "step": 21162 + }, + { + "epoch": 2.6921511258109656, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.861539363861084, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8703278303146362, + "num_tokens": 807561701.0, + "step": 21163 + }, + { + "epoch": 2.692278336089556, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9231477975845337, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8643977642059326, + "num_tokens": 807604058.0, + "step": 21164 + }, + { + "epoch": 2.6924055463681467, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.009357213973999, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8715040683746338, + "num_tokens": 807637344.0, + "step": 21165 + }, + { + "epoch": 2.692532756646737, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7933348417282104, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8581124544143677, + "num_tokens": 807676867.0, + "step": 21166 + }, + { + "epoch": 2.6926599669253277, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8982218503952026, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8684720396995544, + "num_tokens": 807707183.0, + "step": 21167 + }, + { + "epoch": 2.6927871772039182, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0400609970092773, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8569377660751343, + "num_tokens": 807737617.0, + "step": 21168 + }, + { + "epoch": 2.6929143874825083, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9020932912826538, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8584707975387573, + "num_tokens": 807774141.0, + "step": 21169 + }, + { + "epoch": 2.6930415977610993, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7953091859817505, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8711743354797363, + "num_tokens": 807813375.0, + "step": 21170 + }, + { + "epoch": 2.6931688080396894, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8798565864562988, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8513716459274292, + "num_tokens": 807852317.0, + "step": 21171 + }, + { + "epoch": 2.6932960183182804, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9201914072036743, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8665626049041748, + "num_tokens": 807895266.0, + "step": 21172 + }, + { + "epoch": 2.6934232285968704, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7447948455810547, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8667539358139038, + "num_tokens": 807938600.0, + "step": 21173 + }, + { + "epoch": 2.693550438875461, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.033270835876465, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8854343295097351, + "num_tokens": 807974329.0, + "step": 21174 + }, + { + "epoch": 2.6936776491540515, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.860313057899475, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.877251148223877, + "num_tokens": 808015666.0, + "step": 21175 + }, + { + "epoch": 2.693804859432642, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8278354406356812, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8775120973587036, + "num_tokens": 808050176.0, + "step": 21176 + }, + { + "epoch": 2.6939320697112326, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.993726134300232, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8591647148132324, + "num_tokens": 808091910.0, + "step": 21177 + }, + { + "epoch": 2.694059279989823, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.949344515800476, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8617082834243774, + "num_tokens": 808128646.0, + "step": 21178 + }, + { + "epoch": 2.6941864902684136, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8857742547988892, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8694906830787659, + "num_tokens": 808166544.0, + "step": 21179 + }, + { + "epoch": 2.694313700547004, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.007944345474243, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8543435335159302, + "num_tokens": 808200051.0, + "step": 21180 + }, + { + "epoch": 2.6944409108255947, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 7.746331691741943, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8675237894058228, + "num_tokens": 808239251.0, + "step": 21181 + }, + { + "epoch": 2.694568121104185, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.1134445667266846, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8610295057296753, + "num_tokens": 808280639.0, + "step": 21182 + }, + { + "epoch": 2.6946953313827757, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9334906339645386, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8741824626922607, + "num_tokens": 808316141.0, + "step": 21183 + }, + { + "epoch": 2.6948225416613663, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9911144971847534, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8643608689308167, + "num_tokens": 808359172.0, + "step": 21184 + }, + { + "epoch": 2.694949751939957, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.005150079727173, + "learning_rate": 1e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8372781276702881, + "num_tokens": 808397823.0, + "step": 21185 + }, + { + "epoch": 2.6950769622185473, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8448904752731323, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8754177093505859, + "num_tokens": 808435355.0, + "step": 21186 + }, + { + "epoch": 2.695204172497138, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9280765056610107, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8726421594619751, + "num_tokens": 808470792.0, + "step": 21187 + }, + { + "epoch": 2.6953313827757284, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9618659019470215, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8832677602767944, + "num_tokens": 808504591.0, + "step": 21188 + }, + { + "epoch": 2.695458593054319, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8532946109771729, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8782141208648682, + "num_tokens": 808537167.0, + "step": 21189 + }, + { + "epoch": 2.6955858033329094, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8919421434402466, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.881394624710083, + "num_tokens": 808572055.0, + "step": 21190 + }, + { + "epoch": 2.6957130136115, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8923529386520386, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8784468173980713, + "num_tokens": 808605239.0, + "step": 21191 + }, + { + "epoch": 2.6958402238900905, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.992876648902893, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8430977463722229, + "num_tokens": 808643227.0, + "step": 21192 + }, + { + "epoch": 2.695967434168681, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9457796812057495, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8590465784072876, + "num_tokens": 808683855.0, + "step": 21193 + }, + { + "epoch": 2.696094644447271, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8111207485198975, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8724856376647949, + "num_tokens": 808723467.0, + "step": 21194 + }, + { + "epoch": 2.696221854725862, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0393590927124023, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8540608286857605, + "num_tokens": 808759805.0, + "step": 21195 + }, + { + "epoch": 2.696349065004452, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9887701272964478, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8491223454475403, + "num_tokens": 808802024.0, + "step": 21196 + }, + { + "epoch": 2.696476275283043, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.151745319366455, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8559979796409607, + "num_tokens": 808837086.0, + "step": 21197 + }, + { + "epoch": 2.696603485561633, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8569658994674683, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8651107549667358, + "num_tokens": 808875261.0, + "step": 21198 + }, + { + "epoch": 2.6967306958402237, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8875741958618164, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8721466064453125, + "num_tokens": 808913044.0, + "step": 21199 + }, + { + "epoch": 2.6968579061188143, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9202277660369873, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8746687173843384, + "num_tokens": 808952374.0, + "step": 21200 + }, + { + "epoch": 2.696985116397405, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1127853393554688, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8667156100273132, + "num_tokens": 808994397.0, + "step": 21201 + }, + { + "epoch": 2.6971123266759953, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8094323873519897, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8688724637031555, + "num_tokens": 809036401.0, + "step": 21202 + }, + { + "epoch": 2.697239536954586, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.879449725151062, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8710495233535767, + "num_tokens": 809073073.0, + "step": 21203 + }, + { + "epoch": 2.6973667472331764, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 7.702436447143555, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8652442097663879, + "num_tokens": 809117348.0, + "step": 21204 + }, + { + "epoch": 2.697493957511767, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.144444704055786, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8787851333618164, + "num_tokens": 809151357.0, + "step": 21205 + }, + { + "epoch": 2.6976211677903574, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2793664932250977, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8606057167053223, + "num_tokens": 809189000.0, + "step": 21206 + }, + { + "epoch": 2.697748378068948, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9108036756515503, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8682907819747925, + "num_tokens": 809227380.0, + "step": 21207 + }, + { + "epoch": 2.6978755883475385, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.092519521713257, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8656457662582397, + "num_tokens": 809265131.0, + "step": 21208 + }, + { + "epoch": 2.698002798626129, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0410139560699463, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8653792142868042, + "num_tokens": 809297190.0, + "step": 21209 + }, + { + "epoch": 2.6981300089047195, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.893424153327942, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8748953342437744, + "num_tokens": 809336774.0, + "step": 21210 + }, + { + "epoch": 2.69825721918331, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0475029945373535, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8490018844604492, + "num_tokens": 809376612.0, + "step": 21211 + }, + { + "epoch": 2.6983844294619006, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7607381343841553, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8891239166259766, + "num_tokens": 809416058.0, + "step": 21212 + }, + { + "epoch": 2.698511639740491, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8615871667861938, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8778634071350098, + "num_tokens": 809449580.0, + "step": 21213 + }, + { + "epoch": 2.6986388500190817, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.895480751991272, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8682147860527039, + "num_tokens": 809484983.0, + "step": 21214 + }, + { + "epoch": 2.698766060297672, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8822062015533447, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8705288171768188, + "num_tokens": 809521086.0, + "step": 21215 + }, + { + "epoch": 2.6988932705762627, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.7587730884552002, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.864566445350647, + "num_tokens": 809562475.0, + "step": 21216 + }, + { + "epoch": 2.699020480854853, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9676445722579956, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8529877066612244, + "num_tokens": 809596686.0, + "step": 21217 + }, + { + "epoch": 2.6991476911334438, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7807772159576416, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8682030439376831, + "num_tokens": 809636614.0, + "step": 21218 + }, + { + "epoch": 2.699274901412034, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9743727445602417, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8537837266921997, + "num_tokens": 809674542.0, + "step": 21219 + }, + { + "epoch": 2.699402111690625, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.7505015134811401, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8840553760528564, + "num_tokens": 809716726.0, + "step": 21220 + }, + { + "epoch": 2.699529321969215, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.6055474281311035, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.875577449798584, + "num_tokens": 809756790.0, + "step": 21221 + }, + { + "epoch": 2.699656532247806, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.062741756439209, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8635702133178711, + "num_tokens": 809794616.0, + "step": 21222 + }, + { + "epoch": 2.699783742526396, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7156023979187012, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8731062412261963, + "num_tokens": 809836662.0, + "step": 21223 + }, + { + "epoch": 2.6999109528049865, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.025892496109009, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8627170920372009, + "num_tokens": 809873812.0, + "step": 21224 + }, + { + "epoch": 2.700038163083577, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.011857509613037, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8665728569030762, + "num_tokens": 809913139.0, + "step": 21225 + }, + { + "epoch": 2.7001653733621676, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2741143703460693, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8687440156936646, + "num_tokens": 809947741.0, + "step": 21226 + }, + { + "epoch": 2.700292583640758, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9141643047332764, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8578277826309204, + "num_tokens": 809987120.0, + "step": 21227 + }, + { + "epoch": 2.7004197939193486, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.951192021369934, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8589307069778442, + "num_tokens": 810026972.0, + "step": 21228 + }, + { + "epoch": 2.700547004197939, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.128279685974121, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8658714294433594, + "num_tokens": 810062767.0, + "step": 21229 + }, + { + "epoch": 2.7006742144765297, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0877795219421387, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8741436004638672, + "num_tokens": 810096930.0, + "step": 21230 + }, + { + "epoch": 2.70080142475512, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.981555461883545, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8521485328674316, + "num_tokens": 810134600.0, + "step": 21231 + }, + { + "epoch": 2.7009286350337107, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.847212791442871, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8692116141319275, + "num_tokens": 810169535.0, + "step": 21232 + }, + { + "epoch": 2.7010558453123013, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.4454402923583984, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8550260663032532, + "num_tokens": 810205288.0, + "step": 21233 + }, + { + "epoch": 2.701183055590892, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9342048168182373, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8633233308792114, + "num_tokens": 810242868.0, + "step": 21234 + }, + { + "epoch": 2.7013102658694823, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8086574077606201, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8537487983703613, + "num_tokens": 810286539.0, + "step": 21235 + }, + { + "epoch": 2.701437476148073, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.782618522644043, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8687476515769958, + "num_tokens": 810331598.0, + "step": 21236 + }, + { + "epoch": 2.7015646864266634, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.712317705154419, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8742617964744568, + "num_tokens": 810369501.0, + "step": 21237 + }, + { + "epoch": 2.701691896705254, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.7507432699203491, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8626336455345154, + "num_tokens": 810410727.0, + "step": 21238 + }, + { + "epoch": 2.7018191069838444, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.2067017555236816, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8575999736785889, + "num_tokens": 810450315.0, + "step": 21239 + }, + { + "epoch": 2.701946317262435, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.796517252922058, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8572571277618408, + "num_tokens": 810493972.0, + "step": 21240 + }, + { + "epoch": 2.7020735275410255, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9154266119003296, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8787784576416016, + "num_tokens": 810526038.0, + "step": 21241 + }, + { + "epoch": 2.7022007378196156, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0211424827575684, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8701058626174927, + "num_tokens": 810560772.0, + "step": 21242 + }, + { + "epoch": 2.7023279480982065, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9560319185256958, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8728538751602173, + "num_tokens": 810598322.0, + "step": 21243 + }, + { + "epoch": 2.7024551583767966, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.7994729280471802, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8857898712158203, + "num_tokens": 810638652.0, + "step": 21244 + }, + { + "epoch": 2.7025823686553876, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.842673897743225, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8643564581871033, + "num_tokens": 810680318.0, + "step": 21245 + }, + { + "epoch": 2.7027095789339777, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8929952383041382, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8693251013755798, + "num_tokens": 810715001.0, + "step": 21246 + }, + { + "epoch": 2.7028367892125686, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9633806943893433, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8571223020553589, + "num_tokens": 810748897.0, + "step": 21247 + }, + { + "epoch": 2.7029639994911587, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9440230131149292, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8607044219970703, + "num_tokens": 810788954.0, + "step": 21248 + }, + { + "epoch": 2.7030912097697493, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8930308818817139, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.869428277015686, + "num_tokens": 810828897.0, + "step": 21249 + }, + { + "epoch": 2.70321842004834, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8857245445251465, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8840714693069458, + "num_tokens": 810866698.0, + "step": 21250 + }, + { + "epoch": 2.7033456303269303, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0499918460845947, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8587939739227295, + "num_tokens": 810903140.0, + "step": 21251 + }, + { + "epoch": 2.703472840605521, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8918381929397583, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.869333028793335, + "num_tokens": 810940664.0, + "step": 21252 + }, + { + "epoch": 2.7036000508841114, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7671799659729004, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8631142377853394, + "num_tokens": 810981818.0, + "step": 21253 + }, + { + "epoch": 2.703727261162702, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8634121417999268, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8651407957077026, + "num_tokens": 811019897.0, + "step": 21254 + }, + { + "epoch": 2.7038544714412924, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9240872859954834, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8698247671127319, + "num_tokens": 811056310.0, + "step": 21255 + }, + { + "epoch": 2.703981681719883, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.09171724319458, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8558225631713867, + "num_tokens": 811091505.0, + "step": 21256 + }, + { + "epoch": 2.7041088919984735, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7562181949615479, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8786311149597168, + "num_tokens": 811134861.0, + "step": 21257 + }, + { + "epoch": 2.704236102277064, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9182950258255005, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8565268516540527, + "num_tokens": 811170001.0, + "step": 21258 + }, + { + "epoch": 2.7043633125556545, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0061872005462646, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.871040403842926, + "num_tokens": 811204203.0, + "step": 21259 + }, + { + "epoch": 2.704490522834245, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.807129144668579, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8691564798355103, + "num_tokens": 811244598.0, + "step": 21260 + }, + { + "epoch": 2.7046177331128356, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.854376196861267, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8773598670959473, + "num_tokens": 811283352.0, + "step": 21261 + }, + { + "epoch": 2.704744943391426, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9705756902694702, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8537845015525818, + "num_tokens": 811317605.0, + "step": 21262 + }, + { + "epoch": 2.7048721536700167, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.876894474029541, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8799718618392944, + "num_tokens": 811352117.0, + "step": 21263 + }, + { + "epoch": 2.704999363948607, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.031355381011963, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8597396016120911, + "num_tokens": 811386118.0, + "step": 21264 + }, + { + "epoch": 2.7051265742271977, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9575949907302856, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8753459453582764, + "num_tokens": 811424479.0, + "step": 21265 + }, + { + "epoch": 2.7052537845057882, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8537300825119019, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.853925883769989, + "num_tokens": 811464683.0, + "step": 21266 + }, + { + "epoch": 2.7053809947843783, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7118682861328125, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8818444013595581, + "num_tokens": 811504801.0, + "step": 21267 + }, + { + "epoch": 2.7055082050629693, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 82.19331359863281, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8575587272644043, + "num_tokens": 811546278.0, + "step": 21268 + }, + { + "epoch": 2.7056354153415594, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.018080234527588, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8524194359779358, + "num_tokens": 811587019.0, + "step": 21269 + }, + { + "epoch": 2.7057626256201504, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1148245334625244, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8424660563468933, + "num_tokens": 811619843.0, + "step": 21270 + }, + { + "epoch": 2.7058898358987404, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0319366455078125, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8709002137184143, + "num_tokens": 811655797.0, + "step": 21271 + }, + { + "epoch": 2.706017046177331, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8716763257980347, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8564653396606445, + "num_tokens": 811691626.0, + "step": 21272 + }, + { + "epoch": 2.7061442564559215, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9392656087875366, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8672034740447998, + "num_tokens": 811726046.0, + "step": 21273 + }, + { + "epoch": 2.706271466734512, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8772273063659668, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8562228679656982, + "num_tokens": 811765686.0, + "step": 21274 + }, + { + "epoch": 2.7063986770131025, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.00197696685791, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.873076856136322, + "num_tokens": 811801143.0, + "step": 21275 + }, + { + "epoch": 2.706525887291693, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9270790815353394, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8735541701316833, + "num_tokens": 811834807.0, + "step": 21276 + }, + { + "epoch": 2.7066530975702836, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.134769916534424, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8846235275268555, + "num_tokens": 811865676.0, + "step": 21277 + }, + { + "epoch": 2.706780307848874, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0550055503845215, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8623784780502319, + "num_tokens": 811901259.0, + "step": 21278 + }, + { + "epoch": 2.7069075181274647, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9202699661254883, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8510305285453796, + "num_tokens": 811937768.0, + "step": 21279 + }, + { + "epoch": 2.707034728406055, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0826141834259033, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.86549973487854, + "num_tokens": 811967805.0, + "step": 21280 + }, + { + "epoch": 2.7071619386846457, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0146868228912354, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8621670007705688, + "num_tokens": 812007468.0, + "step": 21281 + }, + { + "epoch": 2.7072891489632362, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 20.457645416259766, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.866981565952301, + "num_tokens": 812048127.0, + "step": 21282 + }, + { + "epoch": 2.7074163592418268, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9649455547332764, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8660125136375427, + "num_tokens": 812083904.0, + "step": 21283 + }, + { + "epoch": 2.7075435695204173, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8705122470855713, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8702752590179443, + "num_tokens": 812119643.0, + "step": 21284 + }, + { + "epoch": 2.707670779799008, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.912887454032898, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8640826940536499, + "num_tokens": 812155338.0, + "step": 21285 + }, + { + "epoch": 2.7077979900775984, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8760005235671997, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8656854629516602, + "num_tokens": 812193075.0, + "step": 21286 + }, + { + "epoch": 2.707925200356189, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9871563911437988, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8745418190956116, + "num_tokens": 812228163.0, + "step": 21287 + }, + { + "epoch": 2.7080524106347794, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9661661386489868, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8601994514465332, + "num_tokens": 812268565.0, + "step": 21288 + }, + { + "epoch": 2.70817962091337, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8712859153747559, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8709669709205627, + "num_tokens": 812307292.0, + "step": 21289 + }, + { + "epoch": 2.7083068311919605, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.902109146118164, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8762085437774658, + "num_tokens": 812346180.0, + "step": 21290 + }, + { + "epoch": 2.708434041470551, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9798648357391357, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8633019328117371, + "num_tokens": 812376033.0, + "step": 21291 + }, + { + "epoch": 2.708561251749141, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.940995454788208, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8841591477394104, + "num_tokens": 812407925.0, + "step": 21292 + }, + { + "epoch": 2.708688462027732, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8767380714416504, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8713903427124023, + "num_tokens": 812452354.0, + "step": 21293 + }, + { + "epoch": 2.708815672306322, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.036769151687622, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8642606735229492, + "num_tokens": 812490015.0, + "step": 21294 + }, + { + "epoch": 2.708942882584913, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8236637115478516, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8631235957145691, + "num_tokens": 812529919.0, + "step": 21295 + }, + { + "epoch": 2.709070092863503, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8450366258621216, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8783804178237915, + "num_tokens": 812568541.0, + "step": 21296 + }, + { + "epoch": 2.7091973031420937, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.771319627761841, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8573253154754639, + "num_tokens": 812612098.0, + "step": 21297 + }, + { + "epoch": 2.7093245134206843, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0070180892944336, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.862006664276123, + "num_tokens": 812648843.0, + "step": 21298 + }, + { + "epoch": 2.709451723699275, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0215916633605957, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8671213984489441, + "num_tokens": 812692080.0, + "step": 21299 + }, + { + "epoch": 2.7095789339778653, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8092710971832275, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8790931701660156, + "num_tokens": 812729441.0, + "step": 21300 + }, + { + "epoch": 2.709706144256456, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8496235609054565, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.85979163646698, + "num_tokens": 812768908.0, + "step": 21301 + }, + { + "epoch": 2.7098333545350464, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.829580545425415, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.864888072013855, + "num_tokens": 812808384.0, + "step": 21302 + }, + { + "epoch": 2.709960564813637, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.846479892730713, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8585758805274963, + "num_tokens": 812847658.0, + "step": 21303 + }, + { + "epoch": 2.7100877750922274, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7884727716445923, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8665074110031128, + "num_tokens": 812889394.0, + "step": 21304 + }, + { + "epoch": 2.710214985370818, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0030264854431152, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8520745038986206, + "num_tokens": 812924202.0, + "step": 21305 + }, + { + "epoch": 2.7103421956494085, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9678510427474976, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8849354982376099, + "num_tokens": 812954391.0, + "step": 21306 + }, + { + "epoch": 2.710469405927999, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1114888191223145, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8561733961105347, + "num_tokens": 812989498.0, + "step": 21307 + }, + { + "epoch": 2.7105966162065895, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9029324054718018, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8728870153427124, + "num_tokens": 813030968.0, + "step": 21308 + }, + { + "epoch": 2.71072382648518, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.793851375579834, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8895591497421265, + "num_tokens": 813065843.0, + "step": 21309 + }, + { + "epoch": 2.7108510367637706, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.259042501449585, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8715544939041138, + "num_tokens": 813107410.0, + "step": 21310 + }, + { + "epoch": 2.710978247042361, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.6333532333374023, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8733954429626465, + "num_tokens": 813155959.0, + "step": 21311 + }, + { + "epoch": 2.7111054573209517, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8705605268478394, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8724130392074585, + "num_tokens": 813194133.0, + "step": 21312 + }, + { + "epoch": 2.711232667599542, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.981443166732788, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8545810580253601, + "num_tokens": 813231015.0, + "step": 21313 + }, + { + "epoch": 2.7113598778781327, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.759684681892395, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.881719708442688, + "num_tokens": 813271497.0, + "step": 21314 + }, + { + "epoch": 2.711487088156723, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9613765478134155, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8666764497756958, + "num_tokens": 813305758.0, + "step": 21315 + }, + { + "epoch": 2.7116142984353138, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.280905246734619, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8643941283226013, + "num_tokens": 813345958.0, + "step": 21316 + }, + { + "epoch": 2.711741508713904, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9083428382873535, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8702396154403687, + "num_tokens": 813381630.0, + "step": 21317 + }, + { + "epoch": 2.711868718992495, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.010601758956909, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8657829761505127, + "num_tokens": 813421320.0, + "step": 21318 + }, + { + "epoch": 2.711995929271085, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.832942008972168, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8616269826889038, + "num_tokens": 813458323.0, + "step": 21319 + }, + { + "epoch": 2.712123139549676, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9385321140289307, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8476581573486328, + "num_tokens": 813501436.0, + "step": 21320 + }, + { + "epoch": 2.712250349828266, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7968740463256836, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8672380447387695, + "num_tokens": 813545942.0, + "step": 21321 + }, + { + "epoch": 2.7123775601068565, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8535425662994385, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8604869246482849, + "num_tokens": 813582202.0, + "step": 21322 + }, + { + "epoch": 2.712504770385447, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8398083448410034, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8513901829719543, + "num_tokens": 813621711.0, + "step": 21323 + }, + { + "epoch": 2.7126319806640375, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8854352235794067, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8607933521270752, + "num_tokens": 813665404.0, + "step": 21324 + }, + { + "epoch": 2.712759190942628, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.972286581993103, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8732403516769409, + "num_tokens": 813697470.0, + "step": 21325 + }, + { + "epoch": 2.7128864012212186, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8810218572616577, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8747179508209229, + "num_tokens": 813733308.0, + "step": 21326 + }, + { + "epoch": 2.713013611499809, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8882392644882202, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8538244366645813, + "num_tokens": 813770881.0, + "step": 21327 + }, + { + "epoch": 2.7131408217783997, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9131946563720703, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8710366487503052, + "num_tokens": 813806495.0, + "step": 21328 + }, + { + "epoch": 2.71326803205699, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.01596736907959, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8712203502655029, + "num_tokens": 813842408.0, + "step": 21329 + }, + { + "epoch": 2.7133952423355807, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9225760698318481, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8624244332313538, + "num_tokens": 813883041.0, + "step": 21330 + }, + { + "epoch": 2.7135224526141712, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.4746387004852295, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.862060010433197, + "num_tokens": 813922901.0, + "step": 21331 + }, + { + "epoch": 2.7136496628927618, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8310186862945557, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8685958385467529, + "num_tokens": 813961948.0, + "step": 21332 + }, + { + "epoch": 2.7137768731713523, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.992499589920044, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.881256103515625, + "num_tokens": 813999194.0, + "step": 21333 + }, + { + "epoch": 2.713904083449943, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9248510599136353, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8562930822372437, + "num_tokens": 814038986.0, + "step": 21334 + }, + { + "epoch": 2.7140312937285334, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8080275058746338, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8816468715667725, + "num_tokens": 814074351.0, + "step": 21335 + }, + { + "epoch": 2.714158504007124, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9276559352874756, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8669689297676086, + "num_tokens": 814109255.0, + "step": 21336 + }, + { + "epoch": 2.7142857142857144, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9038054943084717, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8618505001068115, + "num_tokens": 814146601.0, + "step": 21337 + }, + { + "epoch": 2.714412924564305, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.044710636138916, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8722919821739197, + "num_tokens": 814181299.0, + "step": 21338 + }, + { + "epoch": 2.7145401348428955, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8155204057693481, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8756834268569946, + "num_tokens": 814218178.0, + "step": 21339 + }, + { + "epoch": 2.7146673451214856, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9091235399246216, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8688993453979492, + "num_tokens": 814254161.0, + "step": 21340 + }, + { + "epoch": 2.7147945554000765, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0850718021392822, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8656914234161377, + "num_tokens": 814290164.0, + "step": 21341 + }, + { + "epoch": 2.7149217656786666, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.855665922164917, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8459587693214417, + "num_tokens": 814334459.0, + "step": 21342 + }, + { + "epoch": 2.7150489759572576, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.050314426422119, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8583773970603943, + "num_tokens": 814374291.0, + "step": 21343 + }, + { + "epoch": 2.7151761862358477, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8687934875488281, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8630881905555725, + "num_tokens": 814413196.0, + "step": 21344 + }, + { + "epoch": 2.7153033965144386, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0605807304382324, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8640774488449097, + "num_tokens": 814455312.0, + "step": 21345 + }, + { + "epoch": 2.7154306067930287, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9814964532852173, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8625593185424805, + "num_tokens": 814496586.0, + "step": 21346 + }, + { + "epoch": 2.7155578170716193, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.871949315071106, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.873224675655365, + "num_tokens": 814531566.0, + "step": 21347 + }, + { + "epoch": 2.71568502735021, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.062798261642456, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8736057281494141, + "num_tokens": 814566168.0, + "step": 21348 + }, + { + "epoch": 2.7158122376288003, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0459232330322266, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8667559623718262, + "num_tokens": 814604927.0, + "step": 21349 + }, + { + "epoch": 2.715939447907391, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8419058322906494, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8802860975265503, + "num_tokens": 814641401.0, + "step": 21350 + }, + { + "epoch": 2.7160666581859814, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0293405055999756, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8725981712341309, + "num_tokens": 814674252.0, + "step": 21351 + }, + { + "epoch": 2.716193868464572, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9217405319213867, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8661314845085144, + "num_tokens": 814704965.0, + "step": 21352 + }, + { + "epoch": 2.7163210787431624, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.91853928565979, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8380516767501831, + "num_tokens": 814743740.0, + "step": 21353 + }, + { + "epoch": 2.716448289021753, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8884989023208618, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8731613755226135, + "num_tokens": 814784598.0, + "step": 21354 + }, + { + "epoch": 2.7165754993003435, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8820582628250122, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8608880043029785, + "num_tokens": 814821437.0, + "step": 21355 + }, + { + "epoch": 2.716702709578934, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7909013032913208, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8651573657989502, + "num_tokens": 814861638.0, + "step": 21356 + }, + { + "epoch": 2.7168299198575245, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9627037048339844, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.867830216884613, + "num_tokens": 814893799.0, + "step": 21357 + }, + { + "epoch": 2.716957130136115, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8296347856521606, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8735659718513489, + "num_tokens": 814930726.0, + "step": 21358 + }, + { + "epoch": 2.7170843404147056, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9332181215286255, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8657858967781067, + "num_tokens": 814970041.0, + "step": 21359 + }, + { + "epoch": 2.717211550693296, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.130587577819824, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8689583539962769, + "num_tokens": 815008587.0, + "step": 21360 + }, + { + "epoch": 2.7173387609718866, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.235607862472534, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.852354109287262, + "num_tokens": 815047659.0, + "step": 21361 + }, + { + "epoch": 2.717465971250477, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9758453369140625, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8654406070709229, + "num_tokens": 815082631.0, + "step": 21362 + }, + { + "epoch": 2.7175931815290677, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8214046955108643, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8849936127662659, + "num_tokens": 815117568.0, + "step": 21363 + }, + { + "epoch": 2.7177203918076582, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8629623651504517, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8666560053825378, + "num_tokens": 815155019.0, + "step": 21364 + }, + { + "epoch": 2.7178476020862483, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.829107403755188, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8667306900024414, + "num_tokens": 815195386.0, + "step": 21365 + }, + { + "epoch": 2.7179748123648393, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0351336002349854, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8707736730575562, + "num_tokens": 815228304.0, + "step": 21366 + }, + { + "epoch": 2.7181020226434294, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.965899109840393, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8850946426391602, + "num_tokens": 815260111.0, + "step": 21367 + }, + { + "epoch": 2.7182292329220203, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9612091779708862, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8650293350219727, + "num_tokens": 815297272.0, + "step": 21368 + }, + { + "epoch": 2.7183564432006104, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9623838663101196, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8650395274162292, + "num_tokens": 815336016.0, + "step": 21369 + }, + { + "epoch": 2.718483653479201, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0215320587158203, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8391342759132385, + "num_tokens": 815373940.0, + "step": 21370 + }, + { + "epoch": 2.7186108637577915, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0936901569366455, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8548375964164734, + "num_tokens": 815410836.0, + "step": 21371 + }, + { + "epoch": 2.718738074036382, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1990373134613037, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8711128234863281, + "num_tokens": 815449577.0, + "step": 21372 + }, + { + "epoch": 2.7188652843149725, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1972951889038086, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8684759736061096, + "num_tokens": 815485869.0, + "step": 21373 + }, + { + "epoch": 2.718992494593563, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0246012210845947, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8463160991668701, + "num_tokens": 815523614.0, + "step": 21374 + }, + { + "epoch": 2.7191197048721536, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0128886699676514, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8447019457817078, + "num_tokens": 815561835.0, + "step": 21375 + }, + { + "epoch": 2.719246915150744, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7741212844848633, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8496330976486206, + "num_tokens": 815605596.0, + "step": 21376 + }, + { + "epoch": 2.7193741254293347, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0707263946533203, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8601034283638, + "num_tokens": 815638720.0, + "step": 21377 + }, + { + "epoch": 2.719501335707925, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.816472053527832, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.881165623664856, + "num_tokens": 815673996.0, + "step": 21378 + }, + { + "epoch": 2.7196285459865157, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0219595432281494, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8530937433242798, + "num_tokens": 815716571.0, + "step": 21379 + }, + { + "epoch": 2.7197557562651062, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.99907386302948, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8724591135978699, + "num_tokens": 815748973.0, + "step": 21380 + }, + { + "epoch": 2.7198829665436968, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8891935348510742, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8685315847396851, + "num_tokens": 815794754.0, + "step": 21381 + }, + { + "epoch": 2.7200101768222873, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8554784059524536, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8675046563148499, + "num_tokens": 815831051.0, + "step": 21382 + }, + { + "epoch": 2.720137387100878, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7002086639404297, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8669623136520386, + "num_tokens": 815874291.0, + "step": 21383 + }, + { + "epoch": 2.7202645973794684, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1302897930145264, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8622532486915588, + "num_tokens": 815912484.0, + "step": 21384 + }, + { + "epoch": 2.720391807658059, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9973286390304565, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8506597876548767, + "num_tokens": 815948626.0, + "step": 21385 + }, + { + "epoch": 2.7205190179366494, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.051236629486084, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8763258457183838, + "num_tokens": 815983154.0, + "step": 21386 + }, + { + "epoch": 2.72064622821524, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9844274520874023, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8690364360809326, + "num_tokens": 816017524.0, + "step": 21387 + }, + { + "epoch": 2.7207734384938305, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8392518758773804, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.861614465713501, + "num_tokens": 816055204.0, + "step": 21388 + }, + { + "epoch": 2.720900648772421, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.329049825668335, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8749075531959534, + "num_tokens": 816086671.0, + "step": 21389 + }, + { + "epoch": 2.721027859051011, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.4163851737976074, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.846290111541748, + "num_tokens": 816114173.0, + "step": 21390 + }, + { + "epoch": 2.721155069329602, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.774657130241394, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8777142763137817, + "num_tokens": 816156434.0, + "step": 21391 + }, + { + "epoch": 2.721282279608192, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9691083431243896, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.860588550567627, + "num_tokens": 816191037.0, + "step": 21392 + }, + { + "epoch": 2.721409489886783, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.094215154647827, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8798831105232239, + "num_tokens": 816227031.0, + "step": 21393 + }, + { + "epoch": 2.721536700165373, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8354718685150146, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8704485893249512, + "num_tokens": 816264308.0, + "step": 21394 + }, + { + "epoch": 2.7216639104439637, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.072345495223999, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8631433844566345, + "num_tokens": 816300955.0, + "step": 21395 + }, + { + "epoch": 2.7217911207225542, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0821988582611084, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8654841184616089, + "num_tokens": 816332881.0, + "step": 21396 + }, + { + "epoch": 2.7219183310011448, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8672840595245361, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8857928514480591, + "num_tokens": 816367446.0, + "step": 21397 + }, + { + "epoch": 2.7220455412797353, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0631861686706543, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8492849469184875, + "num_tokens": 816407286.0, + "step": 21398 + }, + { + "epoch": 2.722172751558326, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.80457603931427, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8762680292129517, + "num_tokens": 816445277.0, + "step": 21399 + }, + { + "epoch": 2.7222999618369164, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8523732423782349, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8731908202171326, + "num_tokens": 816481638.0, + "step": 21400 + }, + { + "epoch": 2.722427172115507, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9823546409606934, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8683571219444275, + "num_tokens": 816521386.0, + "step": 21401 + }, + { + "epoch": 2.7225543823940974, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9612956047058105, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8783203363418579, + "num_tokens": 816554760.0, + "step": 21402 + }, + { + "epoch": 2.722681592672688, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7992833852767944, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8744626045227051, + "num_tokens": 816595195.0, + "step": 21403 + }, + { + "epoch": 2.7228088029512785, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7078206539154053, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8687540292739868, + "num_tokens": 816633046.0, + "step": 21404 + }, + { + "epoch": 2.722936013229869, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8536276817321777, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8807870149612427, + "num_tokens": 816675283.0, + "step": 21405 + }, + { + "epoch": 2.7230632235084595, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7586928606033325, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8582465648651123, + "num_tokens": 816716499.0, + "step": 21406 + }, + { + "epoch": 2.72319043378705, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7698798179626465, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8605549931526184, + "num_tokens": 816761825.0, + "step": 21407 + }, + { + "epoch": 2.7233176440656406, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8831392526626587, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8740403652191162, + "num_tokens": 816792080.0, + "step": 21408 + }, + { + "epoch": 2.723444854344231, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9937818050384521, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8563623428344727, + "num_tokens": 816832121.0, + "step": 21409 + }, + { + "epoch": 2.7235720646228216, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.888525366783142, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8590432405471802, + "num_tokens": 816874032.0, + "step": 21410 + }, + { + "epoch": 2.723699274901412, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.002241611480713, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8479903936386108, + "num_tokens": 816920471.0, + "step": 21411 + }, + { + "epoch": 2.7238264851800027, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8991779088974, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8676072359085083, + "num_tokens": 816958042.0, + "step": 21412 + }, + { + "epoch": 2.723953695458593, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.040109395980835, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8574939370155334, + "num_tokens": 816992158.0, + "step": 21413 + }, + { + "epoch": 2.7240809057371838, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.059154748916626, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8544631004333496, + "num_tokens": 817032552.0, + "step": 21414 + }, + { + "epoch": 2.724208116015774, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2517781257629395, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8643860816955566, + "num_tokens": 817072240.0, + "step": 21415 + }, + { + "epoch": 2.724335326294365, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1475014686584473, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8565848469734192, + "num_tokens": 817104741.0, + "step": 21416 + }, + { + "epoch": 2.724462536572955, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8509005308151245, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8536951541900635, + "num_tokens": 817143876.0, + "step": 21417 + }, + { + "epoch": 2.724589746851546, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.899459958076477, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8889628052711487, + "num_tokens": 817183647.0, + "step": 21418 + }, + { + "epoch": 2.724716957130136, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9847685098648071, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.860873818397522, + "num_tokens": 817214520.0, + "step": 21419 + }, + { + "epoch": 2.7248441674087265, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0498292446136475, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8561795353889465, + "num_tokens": 817256936.0, + "step": 21420 + }, + { + "epoch": 2.724971377687317, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8537858724594116, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8682111501693726, + "num_tokens": 817292840.0, + "step": 21421 + }, + { + "epoch": 2.7250985879659075, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9210275411605835, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8617085218429565, + "num_tokens": 817332872.0, + "step": 21422 + }, + { + "epoch": 2.725225798244498, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.020054817199707, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8737341165542603, + "num_tokens": 817367668.0, + "step": 21423 + }, + { + "epoch": 2.7253530085230886, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7132925987243652, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8848021030426025, + "num_tokens": 817407726.0, + "step": 21424 + }, + { + "epoch": 2.725480218801679, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.233487844467163, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8765137195587158, + "num_tokens": 817441003.0, + "step": 21425 + }, + { + "epoch": 2.7256074290802697, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.813400387763977, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.872452974319458, + "num_tokens": 817482587.0, + "step": 21426 + }, + { + "epoch": 2.72573463935886, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9921391010284424, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8704463839530945, + "num_tokens": 817519727.0, + "step": 21427 + }, + { + "epoch": 2.7258618496374507, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.930344820022583, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8627781867980957, + "num_tokens": 817553809.0, + "step": 21428 + }, + { + "epoch": 2.7259890599160412, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9068191051483154, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8504443168640137, + "num_tokens": 817588893.0, + "step": 21429 + }, + { + "epoch": 2.7261162701946318, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9049549102783203, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8737148642539978, + "num_tokens": 817627789.0, + "step": 21430 + }, + { + "epoch": 2.7262434804732223, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1854937076568604, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8560348749160767, + "num_tokens": 817670048.0, + "step": 21431 + }, + { + "epoch": 2.726370690751813, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9133775234222412, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8708760738372803, + "num_tokens": 817707108.0, + "step": 21432 + }, + { + "epoch": 2.7264979010304033, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7560769319534302, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8635751008987427, + "num_tokens": 817748126.0, + "step": 21433 + }, + { + "epoch": 2.726625111308994, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.076397180557251, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8684645891189575, + "num_tokens": 817781578.0, + "step": 21434 + }, + { + "epoch": 2.7267523215875844, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8690176010131836, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8721303939819336, + "num_tokens": 817822143.0, + "step": 21435 + }, + { + "epoch": 2.726879531866175, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8355456590652466, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8769760727882385, + "num_tokens": 817858456.0, + "step": 21436 + }, + { + "epoch": 2.7270067421447655, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8181692361831665, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8746225833892822, + "num_tokens": 817892863.0, + "step": 21437 + }, + { + "epoch": 2.7271339524233555, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.895282506942749, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8702420592308044, + "num_tokens": 817931120.0, + "step": 21438 + }, + { + "epoch": 2.7272611627019465, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.08447003364563, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8673968315124512, + "num_tokens": 817963076.0, + "step": 21439 + }, + { + "epoch": 2.7273883729805366, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8367249965667725, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8667260408401489, + "num_tokens": 818003844.0, + "step": 21440 + }, + { + "epoch": 2.7275155832591276, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.6748247146606445, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8690357208251953, + "num_tokens": 818048533.0, + "step": 21441 + }, + { + "epoch": 2.7276427935377177, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8874326944351196, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8641669750213623, + "num_tokens": 818083290.0, + "step": 21442 + }, + { + "epoch": 2.7277700038163086, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.821091890335083, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8761708736419678, + "num_tokens": 818120037.0, + "step": 21443 + }, + { + "epoch": 2.7278972140948987, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7549738883972168, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8663958311080933, + "num_tokens": 818164122.0, + "step": 21444 + }, + { + "epoch": 2.7280244243734892, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.981439471244812, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8691434860229492, + "num_tokens": 818198065.0, + "step": 21445 + }, + { + "epoch": 2.7281516346520798, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.941117286682129, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8681368231773376, + "num_tokens": 818236843.0, + "step": 21446 + }, + { + "epoch": 2.7282788449306703, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.861417293548584, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8686153888702393, + "num_tokens": 818271300.0, + "step": 21447 + }, + { + "epoch": 2.728406055209261, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.784219741821289, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.851906955242157, + "num_tokens": 818311799.0, + "step": 21448 + }, + { + "epoch": 2.7285332654878514, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9011949300765991, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8726979494094849, + "num_tokens": 818346662.0, + "step": 21449 + }, + { + "epoch": 2.728660475766442, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8834372758865356, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8714160323143005, + "num_tokens": 818384465.0, + "step": 21450 + }, + { + "epoch": 2.7287876860450324, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8666672706604004, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8637975454330444, + "num_tokens": 818425625.0, + "step": 21451 + }, + { + "epoch": 2.728914896323623, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0001258850097656, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8725814819335938, + "num_tokens": 818458544.0, + "step": 21452 + }, + { + "epoch": 2.7290421066022135, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9832096099853516, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8618846535682678, + "num_tokens": 818503507.0, + "step": 21453 + }, + { + "epoch": 2.729169316880804, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8061753511428833, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8729212284088135, + "num_tokens": 818545411.0, + "step": 21454 + }, + { + "epoch": 2.7292965271593945, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9838334321975708, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8634527325630188, + "num_tokens": 818582100.0, + "step": 21455 + }, + { + "epoch": 2.729423737437985, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8867733478546143, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8638746738433838, + "num_tokens": 818615549.0, + "step": 21456 + }, + { + "epoch": 2.7295509477165756, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0941503047943115, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8717291951179504, + "num_tokens": 818652476.0, + "step": 21457 + }, + { + "epoch": 2.729678157995166, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9647352695465088, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8702849745750427, + "num_tokens": 818692435.0, + "step": 21458 + }, + { + "epoch": 2.7298053682737566, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.783901333808899, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8661403656005859, + "num_tokens": 818733206.0, + "step": 21459 + }, + { + "epoch": 2.729932578552347, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9196254014968872, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8687151670455933, + "num_tokens": 818771717.0, + "step": 21460 + }, + { + "epoch": 2.7300597888309377, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.856146216392517, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8653366565704346, + "num_tokens": 818806999.0, + "step": 21461 + }, + { + "epoch": 2.7301869991095282, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8104966878890991, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8729827404022217, + "num_tokens": 818846226.0, + "step": 21462 + }, + { + "epoch": 2.7303142093881183, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.025095224380493, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.871761679649353, + "num_tokens": 818885667.0, + "step": 21463 + }, + { + "epoch": 2.7304414196667093, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0005438327789307, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8738027215003967, + "num_tokens": 818924009.0, + "step": 21464 + }, + { + "epoch": 2.7305686299452994, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.789493441581726, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8732507228851318, + "num_tokens": 818962614.0, + "step": 21465 + }, + { + "epoch": 2.7306958402238903, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.926151990890503, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8687121272087097, + "num_tokens": 819004486.0, + "step": 21466 + }, + { + "epoch": 2.7308230505024804, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8884797096252441, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8598203659057617, + "num_tokens": 819048364.0, + "step": 21467 + }, + { + "epoch": 2.730950260781071, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9317020177841187, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8629543781280518, + "num_tokens": 819086250.0, + "step": 21468 + }, + { + "epoch": 2.7310774710596615, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7229421138763428, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8783189654350281, + "num_tokens": 819126705.0, + "step": 21469 + }, + { + "epoch": 2.731204681338252, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9810808897018433, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.868046760559082, + "num_tokens": 819165967.0, + "step": 21470 + }, + { + "epoch": 2.7313318916168425, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7796316146850586, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8638577461242676, + "num_tokens": 819208066.0, + "step": 21471 + }, + { + "epoch": 2.731459101895433, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0653841495513916, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8651657104492188, + "num_tokens": 819246269.0, + "step": 21472 + }, + { + "epoch": 2.7315863121740236, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8485102653503418, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.868536651134491, + "num_tokens": 819286027.0, + "step": 21473 + }, + { + "epoch": 2.731713522452614, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9219107627868652, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8752197027206421, + "num_tokens": 819319994.0, + "step": 21474 + }, + { + "epoch": 2.7318407327312046, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.790784239768982, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8624870777130127, + "num_tokens": 819360113.0, + "step": 21475 + }, + { + "epoch": 2.731967943009795, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0570030212402344, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8738825917243958, + "num_tokens": 819394575.0, + "step": 21476 + }, + { + "epoch": 2.7320951532883857, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.828754186630249, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8604677319526672, + "num_tokens": 819434545.0, + "step": 21477 + }, + { + "epoch": 2.7322223635669762, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9716564416885376, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8754302263259888, + "num_tokens": 819470699.0, + "step": 21478 + }, + { + "epoch": 2.7323495738455668, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8333498239517212, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8614012002944946, + "num_tokens": 819514447.0, + "step": 21479 + }, + { + "epoch": 2.7324767841241573, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9402693510055542, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8613141775131226, + "num_tokens": 819553434.0, + "step": 21480 + }, + { + "epoch": 2.732603994402748, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0001797676086426, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8585524559020996, + "num_tokens": 819591594.0, + "step": 21481 + }, + { + "epoch": 2.7327312046813383, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.954689860343933, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8632371425628662, + "num_tokens": 819634530.0, + "step": 21482 + }, + { + "epoch": 2.732858414959929, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9790141582489014, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8577539920806885, + "num_tokens": 819669358.0, + "step": 21483 + }, + { + "epoch": 2.7329856252385194, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.670990467071533, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8649536371231079, + "num_tokens": 819711336.0, + "step": 21484 + }, + { + "epoch": 2.73311283551711, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9422987699508667, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8560309410095215, + "num_tokens": 819746973.0, + "step": 21485 + }, + { + "epoch": 2.7332400457957005, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9769123792648315, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8757478594779968, + "num_tokens": 819780305.0, + "step": 21486 + }, + { + "epoch": 2.733367256074291, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8496071100234985, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8748737573623657, + "num_tokens": 819817326.0, + "step": 21487 + }, + { + "epoch": 2.733494466352881, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8838610649108887, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8607797622680664, + "num_tokens": 819857597.0, + "step": 21488 + }, + { + "epoch": 2.733621676631472, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8556863069534302, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8669849634170532, + "num_tokens": 819893471.0, + "step": 21489 + }, + { + "epoch": 2.733748886910062, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0174050331115723, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8488154411315918, + "num_tokens": 819934175.0, + "step": 21490 + }, + { + "epoch": 2.733876097188653, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.972078561782837, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8747347593307495, + "num_tokens": 819971451.0, + "step": 21491 + }, + { + "epoch": 2.734003307467243, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.820379376411438, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8628169298171997, + "num_tokens": 820011566.0, + "step": 21492 + }, + { + "epoch": 2.7341305177458337, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.946894884109497, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8424860835075378, + "num_tokens": 820047944.0, + "step": 21493 + }, + { + "epoch": 2.7342577280244242, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.149949312210083, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8536196947097778, + "num_tokens": 820083416.0, + "step": 21494 + }, + { + "epoch": 2.7343849383030148, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8413370847702026, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.858400285243988, + "num_tokens": 820122476.0, + "step": 21495 + }, + { + "epoch": 2.7345121485816053, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.872463583946228, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8710259199142456, + "num_tokens": 820159866.0, + "step": 21496 + }, + { + "epoch": 2.734639358860196, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7866809368133545, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8669177293777466, + "num_tokens": 820202710.0, + "step": 21497 + }, + { + "epoch": 2.7347665691387864, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9640957117080688, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8644073009490967, + "num_tokens": 820239156.0, + "step": 21498 + }, + { + "epoch": 2.734893779417377, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9743350744247437, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8842897415161133, + "num_tokens": 820274010.0, + "step": 21499 + }, + { + "epoch": 2.7350209896959674, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8179116249084473, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8649497628211975, + "num_tokens": 820319171.0, + "step": 21500 + }, + { + "epoch": 2.735148199974558, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9589298963546753, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8516312837600708, + "num_tokens": 820355038.0, + "step": 21501 + }, + { + "epoch": 2.7352754102531485, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9705631732940674, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.862165093421936, + "num_tokens": 820390973.0, + "step": 21502 + }, + { + "epoch": 2.735402620531739, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.95534086227417, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8596251606941223, + "num_tokens": 820424543.0, + "step": 21503 + }, + { + "epoch": 2.7355298308103295, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.7287027835845947, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8660508990287781, + "num_tokens": 820462702.0, + "step": 21504 + }, + { + "epoch": 2.73565704108892, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.047494411468506, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8756390810012817, + "num_tokens": 820498840.0, + "step": 21505 + }, + { + "epoch": 2.7357842513675106, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3267734050750732, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8603646755218506, + "num_tokens": 820534083.0, + "step": 21506 + }, + { + "epoch": 2.735911461646101, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8338611125946045, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8690779805183411, + "num_tokens": 820572479.0, + "step": 21507 + }, + { + "epoch": 2.7360386719246916, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.854540467262268, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.853411853313446, + "num_tokens": 820610283.0, + "step": 21508 + }, + { + "epoch": 2.736165882203282, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.80625581741333, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.882763147354126, + "num_tokens": 820647580.0, + "step": 21509 + }, + { + "epoch": 2.7362930924818727, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9067981243133545, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8722066283226013, + "num_tokens": 820686774.0, + "step": 21510 + }, + { + "epoch": 2.7364203027604628, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.845522403717041, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8751593828201294, + "num_tokens": 820724635.0, + "step": 21511 + }, + { + "epoch": 2.7365475130390537, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8789061307907104, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8750808238983154, + "num_tokens": 820761467.0, + "step": 21512 + }, + { + "epoch": 2.736674723317644, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9769461154937744, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8521528244018555, + "num_tokens": 820801001.0, + "step": 21513 + }, + { + "epoch": 2.736801933596235, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8368206024169922, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8601462841033936, + "num_tokens": 820841028.0, + "step": 21514 + }, + { + "epoch": 2.736929143874825, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8381351232528687, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.874442458152771, + "num_tokens": 820881613.0, + "step": 21515 + }, + { + "epoch": 2.737056354153416, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.7644717693328857, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8742563724517822, + "num_tokens": 820918205.0, + "step": 21516 + }, + { + "epoch": 2.737183564432006, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.153698444366455, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8592060804367065, + "num_tokens": 820950751.0, + "step": 21517 + }, + { + "epoch": 2.7373107747105965, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9033253192901611, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8680822849273682, + "num_tokens": 820987385.0, + "step": 21518 + }, + { + "epoch": 2.737437984989187, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8598285913467407, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8599438667297363, + "num_tokens": 821026273.0, + "step": 21519 + }, + { + "epoch": 2.7375651952677775, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8497825860977173, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8585512638092041, + "num_tokens": 821067978.0, + "step": 21520 + }, + { + "epoch": 2.737692405546368, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7958463430404663, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8710891008377075, + "num_tokens": 821111925.0, + "step": 21521 + }, + { + "epoch": 2.7378196158249586, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.6892271041870117, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8691325783729553, + "num_tokens": 821151019.0, + "step": 21522 + }, + { + "epoch": 2.737946826103549, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.110625743865967, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.873591423034668, + "num_tokens": 821185520.0, + "step": 21523 + }, + { + "epoch": 2.7380740363821396, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.859660029411316, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8675062656402588, + "num_tokens": 821224273.0, + "step": 21524 + }, + { + "epoch": 2.73820124666073, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.899113655090332, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.873244047164917, + "num_tokens": 821263541.0, + "step": 21525 + }, + { + "epoch": 2.7383284569393207, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7898331880569458, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8556442260742188, + "num_tokens": 821304053.0, + "step": 21526 + }, + { + "epoch": 2.7384556672179112, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9904696941375732, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8659989833831787, + "num_tokens": 821339710.0, + "step": 21527 + }, + { + "epoch": 2.7385828774965018, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3219733238220215, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8660365343093872, + "num_tokens": 821378377.0, + "step": 21528 + }, + { + "epoch": 2.7387100877750923, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.920451283454895, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8754274845123291, + "num_tokens": 821417434.0, + "step": 21529 + }, + { + "epoch": 2.738837298053683, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.884225606918335, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8629117608070374, + "num_tokens": 821464323.0, + "step": 21530 + }, + { + "epoch": 2.7389645083322733, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7942625284194946, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8637402057647705, + "num_tokens": 821509891.0, + "step": 21531 + }, + { + "epoch": 2.739091718610864, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8167086839675903, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8555095791816711, + "num_tokens": 821557309.0, + "step": 21532 + }, + { + "epoch": 2.7392189288894544, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8350433111190796, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.862393856048584, + "num_tokens": 821597751.0, + "step": 21533 + }, + { + "epoch": 2.739346139168045, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6927552223205566, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8790972232818604, + "num_tokens": 821637961.0, + "step": 21534 + }, + { + "epoch": 2.7394733494466355, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7298526763916016, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8499487638473511, + "num_tokens": 821684728.0, + "step": 21535 + }, + { + "epoch": 2.7396005597252255, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9991649389266968, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8717514276504517, + "num_tokens": 821715359.0, + "step": 21536 + }, + { + "epoch": 2.7397277700038165, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8311446905136108, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8607671856880188, + "num_tokens": 821755540.0, + "step": 21537 + }, + { + "epoch": 2.7398549802824066, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.939792275428772, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8661248683929443, + "num_tokens": 821788506.0, + "step": 21538 + }, + { + "epoch": 2.7399821905609976, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9296129941940308, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8643474578857422, + "num_tokens": 821826555.0, + "step": 21539 + }, + { + "epoch": 2.7401094008395877, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8891973495483398, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8549182415008545, + "num_tokens": 821866160.0, + "step": 21540 + }, + { + "epoch": 2.7402366111181786, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9465348720550537, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8722478151321411, + "num_tokens": 821905595.0, + "step": 21541 + }, + { + "epoch": 2.7403638213967687, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9041776657104492, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8570722341537476, + "num_tokens": 821944769.0, + "step": 21542 + }, + { + "epoch": 2.7404910316753592, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6418012380599976, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8735228776931763, + "num_tokens": 821988889.0, + "step": 21543 + }, + { + "epoch": 2.7406182419539498, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.9075076580047607, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8692978620529175, + "num_tokens": 822025343.0, + "step": 21544 + }, + { + "epoch": 2.7407454522325403, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8737765550613403, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8544336557388306, + "num_tokens": 822067651.0, + "step": 21545 + }, + { + "epoch": 2.740872662511131, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.91989004611969, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8579564094543457, + "num_tokens": 822102885.0, + "step": 21546 + }, + { + "epoch": 2.7409998727897213, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8979692459106445, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8568655252456665, + "num_tokens": 822142515.0, + "step": 21547 + }, + { + "epoch": 2.741127083068312, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0817737579345703, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8619518876075745, + "num_tokens": 822180745.0, + "step": 21548 + }, + { + "epoch": 2.7412542933469024, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8238613605499268, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.875572919845581, + "num_tokens": 822215117.0, + "step": 21549 + }, + { + "epoch": 2.741381503625493, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8051526546478271, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8564276695251465, + "num_tokens": 822258883.0, + "step": 21550 + }, + { + "epoch": 2.7415087139040835, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9514503479003906, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8632187843322754, + "num_tokens": 822295254.0, + "step": 21551 + }, + { + "epoch": 2.741635924182674, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.855342149734497, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8687520623207092, + "num_tokens": 822335769.0, + "step": 21552 + }, + { + "epoch": 2.7417631344612645, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.818129539489746, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8757028579711914, + "num_tokens": 822376873.0, + "step": 21553 + }, + { + "epoch": 2.741890344739855, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.806225061416626, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8613778352737427, + "num_tokens": 822419868.0, + "step": 21554 + }, + { + "epoch": 2.7420175550184456, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8201327323913574, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8654086589813232, + "num_tokens": 822456757.0, + "step": 21555 + }, + { + "epoch": 2.742144765297036, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9391473531723022, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8798491954803467, + "num_tokens": 822492772.0, + "step": 21556 + }, + { + "epoch": 2.7422719755756266, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.718472957611084, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8487454056739807, + "num_tokens": 822535400.0, + "step": 21557 + }, + { + "epoch": 2.742399185854217, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9875774383544922, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8723161220550537, + "num_tokens": 822568313.0, + "step": 21558 + }, + { + "epoch": 2.7425263961328077, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0306224822998047, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8721978664398193, + "num_tokens": 822608487.0, + "step": 21559 + }, + { + "epoch": 2.742653606411398, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8167060613632202, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8656070828437805, + "num_tokens": 822647938.0, + "step": 21560 + }, + { + "epoch": 2.7427808166899883, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9583357572555542, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8629852533340454, + "num_tokens": 822685974.0, + "step": 21561 + }, + { + "epoch": 2.7429080269685793, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0705535411834717, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8690513968467712, + "num_tokens": 822719166.0, + "step": 21562 + }, + { + "epoch": 2.7430352372471694, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8626067638397217, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8549481630325317, + "num_tokens": 822758364.0, + "step": 21563 + }, + { + "epoch": 2.7431624475257603, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.620168685913086, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.87397301197052, + "num_tokens": 822798002.0, + "step": 21564 + }, + { + "epoch": 2.7432896578043504, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8605165481567383, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8669763803482056, + "num_tokens": 822837958.0, + "step": 21565 + }, + { + "epoch": 2.743416868082941, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9887195825576782, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8656639456748962, + "num_tokens": 822873076.0, + "step": 21566 + }, + { + "epoch": 2.7435440783615315, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9109573364257812, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8643578886985779, + "num_tokens": 822904693.0, + "step": 21567 + }, + { + "epoch": 2.743671288640122, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.93193519115448, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8610503673553467, + "num_tokens": 822941192.0, + "step": 21568 + }, + { + "epoch": 2.7437984989187125, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7808390855789185, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.853335976600647, + "num_tokens": 822982166.0, + "step": 21569 + }, + { + "epoch": 2.743925709197303, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.097381353378296, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.859368085861206, + "num_tokens": 823021630.0, + "step": 21570 + }, + { + "epoch": 2.7440529194758936, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8801796436309814, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8644636869430542, + "num_tokens": 823062133.0, + "step": 21571 + }, + { + "epoch": 2.744180129754484, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9276303052902222, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8617645502090454, + "num_tokens": 823105163.0, + "step": 21572 + }, + { + "epoch": 2.7443073400330746, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 8.651930809020996, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8468248248100281, + "num_tokens": 823146878.0, + "step": 21573 + }, + { + "epoch": 2.744434550311665, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0372567176818848, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8825159072875977, + "num_tokens": 823182216.0, + "step": 21574 + }, + { + "epoch": 2.7445617605902557, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.839375615119934, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.873479962348938, + "num_tokens": 823227838.0, + "step": 21575 + }, + { + "epoch": 2.7446889708688462, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8810179233551025, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8574529886245728, + "num_tokens": 823268609.0, + "step": 21576 + }, + { + "epoch": 2.7448161811474368, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.491036891937256, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8760185241699219, + "num_tokens": 823309426.0, + "step": 21577 + }, + { + "epoch": 2.7449433914260273, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9971978664398193, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8599245548248291, + "num_tokens": 823346049.0, + "step": 21578 + }, + { + "epoch": 2.745070601704618, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0591444969177246, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8589041233062744, + "num_tokens": 823378915.0, + "step": 21579 + }, + { + "epoch": 2.7451978119832083, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8878016471862793, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8559505343437195, + "num_tokens": 823422985.0, + "step": 21580 + }, + { + "epoch": 2.745325022261799, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8971720933914185, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8530827760696411, + "num_tokens": 823460703.0, + "step": 21581 + }, + { + "epoch": 2.7454522325403894, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.809733271598816, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8678314089775085, + "num_tokens": 823499651.0, + "step": 21582 + }, + { + "epoch": 2.74557944281898, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1101930141448975, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.865408182144165, + "num_tokens": 823537044.0, + "step": 21583 + }, + { + "epoch": 2.7457066530975704, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9662805795669556, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8792009353637695, + "num_tokens": 823571832.0, + "step": 21584 + }, + { + "epoch": 2.745833863376161, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8295536041259766, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.872211754322052, + "num_tokens": 823609852.0, + "step": 21585 + }, + { + "epoch": 2.745961073654751, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.985286831855774, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8691369295120239, + "num_tokens": 823644419.0, + "step": 21586 + }, + { + "epoch": 2.746088283933342, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.851101040840149, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8707444667816162, + "num_tokens": 823682451.0, + "step": 21587 + }, + { + "epoch": 2.746215494211932, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8978121280670166, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8771226406097412, + "num_tokens": 823723051.0, + "step": 21588 + }, + { + "epoch": 2.746342704490523, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9096715450286865, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8629094362258911, + "num_tokens": 823761353.0, + "step": 21589 + }, + { + "epoch": 2.746469914769113, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.934606909751892, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8512378334999084, + "num_tokens": 823800281.0, + "step": 21590 + }, + { + "epoch": 2.7465971250477037, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.432924270629883, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8791167140007019, + "num_tokens": 823843255.0, + "step": 21591 + }, + { + "epoch": 2.7467243353262942, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9358466863632202, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8627648949623108, + "num_tokens": 823880736.0, + "step": 21592 + }, + { + "epoch": 2.7468515456048848, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2362096309661865, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8633161187171936, + "num_tokens": 823923232.0, + "step": 21593 + }, + { + "epoch": 2.7469787558834753, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9279600381851196, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8699241876602173, + "num_tokens": 823957488.0, + "step": 21594 + }, + { + "epoch": 2.747105966162066, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7191777229309082, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.861847460269928, + "num_tokens": 823999568.0, + "step": 21595 + }, + { + "epoch": 2.7472331764406563, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8134822845458984, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8766323924064636, + "num_tokens": 824036947.0, + "step": 21596 + }, + { + "epoch": 2.747360386719247, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.065945625305176, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8781800270080566, + "num_tokens": 824067966.0, + "step": 21597 + }, + { + "epoch": 2.7474875969978374, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.927276611328125, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8626484870910645, + "num_tokens": 824112216.0, + "step": 21598 + }, + { + "epoch": 2.747614807276428, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7897206544876099, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8756167888641357, + "num_tokens": 824148749.0, + "step": 21599 + }, + { + "epoch": 2.7477420175550185, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9667913913726807, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8639169931411743, + "num_tokens": 824186685.0, + "step": 21600 + }, + { + "epoch": 2.747869227833609, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.080673933029175, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8604648113250732, + "num_tokens": 824223716.0, + "step": 21601 + }, + { + "epoch": 2.7479964381121995, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 16.480175018310547, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8643383979797363, + "num_tokens": 824264627.0, + "step": 21602 + }, + { + "epoch": 2.74812364839079, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0816152095794678, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8666096925735474, + "num_tokens": 824301901.0, + "step": 21603 + }, + { + "epoch": 2.7482508586693806, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.047762393951416, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8700034618377686, + "num_tokens": 824344621.0, + "step": 21604 + }, + { + "epoch": 2.748378068947971, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9810961484909058, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8719396591186523, + "num_tokens": 824385850.0, + "step": 21605 + }, + { + "epoch": 2.7485052792265616, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8999418020248413, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8924630880355835, + "num_tokens": 824425545.0, + "step": 21606 + }, + { + "epoch": 2.748632489505152, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.6686129570007324, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8822766542434692, + "num_tokens": 824468611.0, + "step": 21607 + }, + { + "epoch": 2.7487596997837427, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7035529613494873, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8706921339035034, + "num_tokens": 824512161.0, + "step": 21608 + }, + { + "epoch": 2.7488869100623328, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7709256410598755, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8792162537574768, + "num_tokens": 824555502.0, + "step": 21609 + }, + { + "epoch": 2.7490141203409237, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.92693030834198, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8604426383972168, + "num_tokens": 824596277.0, + "step": 21610 + }, + { + "epoch": 2.749141330619514, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.8391714096069336, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8541774749755859, + "num_tokens": 824633704.0, + "step": 21611 + }, + { + "epoch": 2.749268540898105, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.97258722782135, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8702334761619568, + "num_tokens": 824673584.0, + "step": 21612 + }, + { + "epoch": 2.749395751176695, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0416855812072754, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8580427169799805, + "num_tokens": 824712508.0, + "step": 21613 + }, + { + "epoch": 2.749522961455286, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7399473190307617, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8768727779388428, + "num_tokens": 824754353.0, + "step": 21614 + }, + { + "epoch": 2.749650171733876, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7615071535110474, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8496032953262329, + "num_tokens": 824796333.0, + "step": 21615 + }, + { + "epoch": 2.7497773820124665, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9990570545196533, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8592091798782349, + "num_tokens": 824832048.0, + "step": 21616 + }, + { + "epoch": 2.749904592291057, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9803885221481323, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8598425388336182, + "num_tokens": 824871889.0, + "step": 21617 + }, + { + "epoch": 2.7500318025696475, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9252610206604004, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8767199516296387, + "num_tokens": 824912547.0, + "step": 21618 + }, + { + "epoch": 2.750159012848238, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8093044757843018, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.847213625907898, + "num_tokens": 824955626.0, + "step": 21619 + }, + { + "epoch": 2.7502862231268286, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8283405303955078, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8741309642791748, + "num_tokens": 824991813.0, + "step": 21620 + }, + { + "epoch": 2.750413433405419, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9604642391204834, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8746391534805298, + "num_tokens": 825024356.0, + "step": 21621 + }, + { + "epoch": 2.7505406436840096, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0294651985168457, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8631253242492676, + "num_tokens": 825060316.0, + "step": 21622 + }, + { + "epoch": 2.7506678539626, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.147564649581909, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8587859869003296, + "num_tokens": 825096044.0, + "step": 21623 + }, + { + "epoch": 2.7507950642411907, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 16.52313995361328, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8635429739952087, + "num_tokens": 825123999.0, + "step": 21624 + }, + { + "epoch": 2.750922274519781, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2245471477508545, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8541203141212463, + "num_tokens": 825170155.0, + "step": 21625 + }, + { + "epoch": 2.7510494847983717, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9682713747024536, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8638628125190735, + "num_tokens": 825213388.0, + "step": 21626 + }, + { + "epoch": 2.7511766950769623, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9736031293869019, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8719134330749512, + "num_tokens": 825247171.0, + "step": 21627 + }, + { + "epoch": 2.751303905355553, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9042024612426758, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8659158945083618, + "num_tokens": 825283271.0, + "step": 21628 + }, + { + "epoch": 2.7514311156341433, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9165375232696533, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8648158311843872, + "num_tokens": 825318976.0, + "step": 21629 + }, + { + "epoch": 2.751558325912734, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.023486375808716, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8662376403808594, + "num_tokens": 825358019.0, + "step": 21630 + }, + { + "epoch": 2.7516855361913244, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.937617301940918, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8838014602661133, + "num_tokens": 825394025.0, + "step": 21631 + }, + { + "epoch": 2.751812746469915, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8099348545074463, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8773159384727478, + "num_tokens": 825433021.0, + "step": 21632 + }, + { + "epoch": 2.7519399567485054, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.774833083152771, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8657523393630981, + "num_tokens": 825477026.0, + "step": 21633 + }, + { + "epoch": 2.7520671670270955, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1992697715759277, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8716943860054016, + "num_tokens": 825511054.0, + "step": 21634 + }, + { + "epoch": 2.7521943773056865, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8776869773864746, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8532482385635376, + "num_tokens": 825549654.0, + "step": 21635 + }, + { + "epoch": 2.7523215875842766, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0317041873931885, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8687978386878967, + "num_tokens": 825587269.0, + "step": 21636 + }, + { + "epoch": 2.7524487978628676, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0494585037231445, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8894937634468079, + "num_tokens": 825624864.0, + "step": 21637 + }, + { + "epoch": 2.7525760081414576, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.179434299468994, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8578698635101318, + "num_tokens": 825667025.0, + "step": 21638 + }, + { + "epoch": 2.7527032184200486, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.865735411643982, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8502779006958008, + "num_tokens": 825709505.0, + "step": 21639 + }, + { + "epoch": 2.7528304286986387, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8010616302490234, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8692131042480469, + "num_tokens": 825747376.0, + "step": 21640 + }, + { + "epoch": 2.7529576389772292, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.837726354598999, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8736433982849121, + "num_tokens": 825784817.0, + "step": 21641 + }, + { + "epoch": 2.7530848492558198, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9757486581802368, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8625392913818359, + "num_tokens": 825815631.0, + "step": 21642 + }, + { + "epoch": 2.7532120595344103, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8158186674118042, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8654423356056213, + "num_tokens": 825854202.0, + "step": 21643 + }, + { + "epoch": 2.753339269813001, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8988620042800903, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8728873133659363, + "num_tokens": 825890636.0, + "step": 21644 + }, + { + "epoch": 2.7534664800915913, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9194130897521973, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8421982526779175, + "num_tokens": 825931054.0, + "step": 21645 + }, + { + "epoch": 2.753593690370182, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.889814853668213, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8619686365127563, + "num_tokens": 825972519.0, + "step": 21646 + }, + { + "epoch": 2.7537209006487724, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8438003063201904, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8691604137420654, + "num_tokens": 826010010.0, + "step": 21647 + }, + { + "epoch": 2.753848110927363, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9112434387207031, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.865990936756134, + "num_tokens": 826048348.0, + "step": 21648 + }, + { + "epoch": 2.7539753212059535, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7966958284378052, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8627199530601501, + "num_tokens": 826087807.0, + "step": 21649 + }, + { + "epoch": 2.754102531484544, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9220795631408691, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8622472286224365, + "num_tokens": 826124657.0, + "step": 21650 + }, + { + "epoch": 2.7542297417631345, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1243762969970703, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8683384656906128, + "num_tokens": 826164190.0, + "step": 21651 + }, + { + "epoch": 2.754356952041725, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8396083116531372, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8683456182479858, + "num_tokens": 826202876.0, + "step": 21652 + }, + { + "epoch": 2.7544841623203156, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8577313423156738, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8638559579849243, + "num_tokens": 826242955.0, + "step": 21653 + }, + { + "epoch": 2.754611372598906, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8224530220031738, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8754929304122925, + "num_tokens": 826279925.0, + "step": 21654 + }, + { + "epoch": 2.7547385828774966, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 16.612197875976562, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8748180866241455, + "num_tokens": 826314076.0, + "step": 21655 + }, + { + "epoch": 2.754865793156087, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.008810043334961, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.863190770149231, + "num_tokens": 826354138.0, + "step": 21656 + }, + { + "epoch": 2.7549930034346777, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.022669553756714, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8606893420219421, + "num_tokens": 826395393.0, + "step": 21657 + }, + { + "epoch": 2.755120213713268, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.206902027130127, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8551539182662964, + "num_tokens": 826437539.0, + "step": 21658 + }, + { + "epoch": 2.7552474239918583, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9363696575164795, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.858634352684021, + "num_tokens": 826475857.0, + "step": 21659 + }, + { + "epoch": 2.7553746342704493, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8657900094985962, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8748971223831177, + "num_tokens": 826513956.0, + "step": 21660 + }, + { + "epoch": 2.7555018445490393, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9688035249710083, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8629484176635742, + "num_tokens": 826552747.0, + "step": 21661 + }, + { + "epoch": 2.7556290548276303, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.971457839012146, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8718188405036926, + "num_tokens": 826590830.0, + "step": 21662 + }, + { + "epoch": 2.7557562651062204, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8589019775390625, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8688623905181885, + "num_tokens": 826633414.0, + "step": 21663 + }, + { + "epoch": 2.755883475384811, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9511597156524658, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8749513626098633, + "num_tokens": 826668344.0, + "step": 21664 + }, + { + "epoch": 2.7560106856634015, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8263224363327026, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8506338596343994, + "num_tokens": 826710587.0, + "step": 21665 + }, + { + "epoch": 2.756137895941992, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8915997743606567, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8555916547775269, + "num_tokens": 826748854.0, + "step": 21666 + }, + { + "epoch": 2.7562651062205825, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0035815238952637, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8531016707420349, + "num_tokens": 826789591.0, + "step": 21667 + }, + { + "epoch": 2.756392316499173, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.852984070777893, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8618910312652588, + "num_tokens": 826831890.0, + "step": 21668 + }, + { + "epoch": 2.7565195267777636, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.042861223220825, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8483644723892212, + "num_tokens": 826870265.0, + "step": 21669 + }, + { + "epoch": 2.756646737056354, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1586549282073975, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8653609752655029, + "num_tokens": 826910993.0, + "step": 21670 + }, + { + "epoch": 2.7567739473349446, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9308421611785889, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8647177219390869, + "num_tokens": 826947492.0, + "step": 21671 + }, + { + "epoch": 2.756901157613535, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8926663398742676, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8676749467849731, + "num_tokens": 826987215.0, + "step": 21672 + }, + { + "epoch": 2.7570283678921257, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9086042642593384, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8628944158554077, + "num_tokens": 827024092.0, + "step": 21673 + }, + { + "epoch": 2.757155578170716, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1348326206207275, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8490827679634094, + "num_tokens": 827060541.0, + "step": 21674 + }, + { + "epoch": 2.7572827884493067, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8727953433990479, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8650028109550476, + "num_tokens": 827101205.0, + "step": 21675 + }, + { + "epoch": 2.7574099987278973, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.054860830307007, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.859687089920044, + "num_tokens": 827132441.0, + "step": 21676 + }, + { + "epoch": 2.757537209006488, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.790895700454712, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8651604652404785, + "num_tokens": 827169530.0, + "step": 21677 + }, + { + "epoch": 2.7576644192850783, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9153081178665161, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8709269762039185, + "num_tokens": 827207352.0, + "step": 21678 + }, + { + "epoch": 2.757791629563669, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.862154483795166, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.858985185623169, + "num_tokens": 827242790.0, + "step": 21679 + }, + { + "epoch": 2.7579188398422594, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8271980285644531, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8668315410614014, + "num_tokens": 827281676.0, + "step": 21680 + }, + { + "epoch": 2.75804605012085, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9475188255310059, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8774057030677795, + "num_tokens": 827313676.0, + "step": 21681 + }, + { + "epoch": 2.7581732603994404, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8945034742355347, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8651474118232727, + "num_tokens": 827355017.0, + "step": 21682 + }, + { + "epoch": 2.758300470678031, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8679087162017822, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8828380107879639, + "num_tokens": 827391773.0, + "step": 21683 + }, + { + "epoch": 2.758427680956621, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0087928771972656, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8814662098884583, + "num_tokens": 827433034.0, + "step": 21684 + }, + { + "epoch": 2.758554891235212, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.090689182281494, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8770979642868042, + "num_tokens": 827475729.0, + "step": 21685 + }, + { + "epoch": 2.758682101513802, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8813244104385376, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8691912889480591, + "num_tokens": 827514127.0, + "step": 21686 + }, + { + "epoch": 2.758809311792393, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.213108777999878, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8767906427383423, + "num_tokens": 827546709.0, + "step": 21687 + }, + { + "epoch": 2.758936522070983, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.962295651435852, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8852407932281494, + "num_tokens": 827581450.0, + "step": 21688 + }, + { + "epoch": 2.7590637323495737, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9029417037963867, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.862004280090332, + "num_tokens": 827623161.0, + "step": 21689 + }, + { + "epoch": 2.7591909426281642, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9163273572921753, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8630845546722412, + "num_tokens": 827659460.0, + "step": 21690 + }, + { + "epoch": 2.7593181529067548, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8123348951339722, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8605865240097046, + "num_tokens": 827701110.0, + "step": 21691 + }, + { + "epoch": 2.7594453631853453, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9687855243682861, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.862859845161438, + "num_tokens": 827738992.0, + "step": 21692 + }, + { + "epoch": 2.759572573463936, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.866855263710022, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8509313464164734, + "num_tokens": 827775853.0, + "step": 21693 + }, + { + "epoch": 2.7596997837425263, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7397044897079468, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8677493333816528, + "num_tokens": 827819438.0, + "step": 21694 + }, + { + "epoch": 2.759826994021117, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9193646907806396, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8585925698280334, + "num_tokens": 827858833.0, + "step": 21695 + }, + { + "epoch": 2.7599542042997074, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9498045444488525, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8576823472976685, + "num_tokens": 827895069.0, + "step": 21696 + }, + { + "epoch": 2.760081414578298, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8219599723815918, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8808197379112244, + "num_tokens": 827932970.0, + "step": 21697 + }, + { + "epoch": 2.7602086248568884, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9702777862548828, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.877565324306488, + "num_tokens": 827981859.0, + "step": 21698 + }, + { + "epoch": 2.760335835135479, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8683674335479736, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8600077629089355, + "num_tokens": 828022018.0, + "step": 21699 + }, + { + "epoch": 2.7604630454140695, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8529906272888184, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8615783452987671, + "num_tokens": 828061546.0, + "step": 21700 + }, + { + "epoch": 2.76059025569266, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0213446617126465, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.852134108543396, + "num_tokens": 828104464.0, + "step": 21701 + }, + { + "epoch": 2.7607174659712506, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9317408800125122, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8545314073562622, + "num_tokens": 828142527.0, + "step": 21702 + }, + { + "epoch": 2.760844676249841, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7943476438522339, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8670090436935425, + "num_tokens": 828189670.0, + "step": 21703 + }, + { + "epoch": 2.7609718865284316, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.803857445716858, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8733699321746826, + "num_tokens": 828231511.0, + "step": 21704 + }, + { + "epoch": 2.761099096807022, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7303963899612427, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8658591508865356, + "num_tokens": 828274387.0, + "step": 21705 + }, + { + "epoch": 2.7612263070856127, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8702380657196045, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8587343096733093, + "num_tokens": 828312263.0, + "step": 21706 + }, + { + "epoch": 2.7613535173642028, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0355961322784424, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8671411275863647, + "num_tokens": 828345550.0, + "step": 21707 + }, + { + "epoch": 2.7614807276427937, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.010915756225586, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8535323739051819, + "num_tokens": 828378633.0, + "step": 21708 + }, + { + "epoch": 2.761607937921384, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9947006702423096, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8589425086975098, + "num_tokens": 828414292.0, + "step": 21709 + }, + { + "epoch": 2.761735148199975, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.901208519935608, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8653185367584229, + "num_tokens": 828452682.0, + "step": 21710 + }, + { + "epoch": 2.761862358478565, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9615287780761719, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8823459148406982, + "num_tokens": 828494162.0, + "step": 21711 + }, + { + "epoch": 2.761989568757156, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9207947254180908, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8700551986694336, + "num_tokens": 828525997.0, + "step": 21712 + }, + { + "epoch": 2.762116779035746, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8738421201705933, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8635953664779663, + "num_tokens": 828562488.0, + "step": 21713 + }, + { + "epoch": 2.7622439893143365, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.120023727416992, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8832834362983704, + "num_tokens": 828591818.0, + "step": 21714 + }, + { + "epoch": 2.762371199592927, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9808534383773804, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8536686897277832, + "num_tokens": 828628872.0, + "step": 21715 + }, + { + "epoch": 2.7624984098715175, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8172529935836792, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8739511370658875, + "num_tokens": 828670939.0, + "step": 21716 + }, + { + "epoch": 2.762625620150108, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9631919860839844, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8467956781387329, + "num_tokens": 828708446.0, + "step": 21717 + }, + { + "epoch": 2.7627528304286986, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7945771217346191, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.870512843132019, + "num_tokens": 828749873.0, + "step": 21718 + }, + { + "epoch": 2.762880040707289, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.836251974105835, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8581410050392151, + "num_tokens": 828790066.0, + "step": 21719 + }, + { + "epoch": 2.7630072509858796, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.804527521133423, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8631446361541748, + "num_tokens": 828821911.0, + "step": 21720 + }, + { + "epoch": 2.76313446126447, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9228461980819702, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8588308691978455, + "num_tokens": 828856572.0, + "step": 21721 + }, + { + "epoch": 2.7632616715430607, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.734355092048645, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8777937889099121, + "num_tokens": 828898865.0, + "step": 21722 + }, + { + "epoch": 2.763388881821651, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8666096925735474, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8736367225646973, + "num_tokens": 828933768.0, + "step": 21723 + }, + { + "epoch": 2.7635160921002417, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9443767070770264, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.880316436290741, + "num_tokens": 828968869.0, + "step": 21724 + }, + { + "epoch": 2.7636433023788323, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0194780826568604, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8501286506652832, + "num_tokens": 829005621.0, + "step": 21725 + }, + { + "epoch": 2.763770512657423, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8249001502990723, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8539225459098816, + "num_tokens": 829045046.0, + "step": 21726 + }, + { + "epoch": 2.7638977229360133, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1376380920410156, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8788076639175415, + "num_tokens": 829084588.0, + "step": 21727 + }, + { + "epoch": 2.764024933214604, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9037901163101196, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8489804267883301, + "num_tokens": 829121010.0, + "step": 21728 + }, + { + "epoch": 2.7641521434931944, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 2.94008469581604, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8752261996269226, + "num_tokens": 829159618.0, + "step": 21729 + }, + { + "epoch": 2.764279353771785, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9354925155639648, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8523808121681213, + "num_tokens": 829204826.0, + "step": 21730 + }, + { + "epoch": 2.7644065640503754, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8839470148086548, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8753117918968201, + "num_tokens": 829240741.0, + "step": 21731 + }, + { + "epoch": 2.7645337743289655, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8225131034851074, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8855646848678589, + "num_tokens": 829278065.0, + "step": 21732 + }, + { + "epoch": 2.7646609846075565, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7592331171035767, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8621078729629517, + "num_tokens": 829319438.0, + "step": 21733 + }, + { + "epoch": 2.7647881948861466, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.755035400390625, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8755719065666199, + "num_tokens": 829357299.0, + "step": 21734 + }, + { + "epoch": 2.7649154051647375, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7875596284866333, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8677878379821777, + "num_tokens": 829398600.0, + "step": 21735 + }, + { + "epoch": 2.7650426154433276, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.010533571243286, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8544420003890991, + "num_tokens": 829439214.0, + "step": 21736 + }, + { + "epoch": 2.7651698257219186, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9240649938583374, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8534519076347351, + "num_tokens": 829477634.0, + "step": 21737 + }, + { + "epoch": 2.7652970360005087, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7849938869476318, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8724928498268127, + "num_tokens": 829514527.0, + "step": 21738 + }, + { + "epoch": 2.765424246279099, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8650639057159424, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8667137026786804, + "num_tokens": 829558531.0, + "step": 21739 + }, + { + "epoch": 2.7655514565576897, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.835191249847412, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8689101934432983, + "num_tokens": 829600769.0, + "step": 21740 + }, + { + "epoch": 2.7656786668362803, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0445497035980225, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8711740970611572, + "num_tokens": 829635938.0, + "step": 21741 + }, + { + "epoch": 2.765805877114871, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.344531774520874, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8651340007781982, + "num_tokens": 829672956.0, + "step": 21742 + }, + { + "epoch": 2.7659330873934613, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.833820104598999, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8648334741592407, + "num_tokens": 829712658.0, + "step": 21743 + }, + { + "epoch": 2.766060297672052, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7659986019134521, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8684484958648682, + "num_tokens": 829755429.0, + "step": 21744 + }, + { + "epoch": 2.7661875079506424, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.885526180267334, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8636301159858704, + "num_tokens": 829796373.0, + "step": 21745 + }, + { + "epoch": 2.766314718229233, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.848387360572815, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8613688945770264, + "num_tokens": 829834049.0, + "step": 21746 + }, + { + "epoch": 2.7664419285078234, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8496302366256714, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8653843402862549, + "num_tokens": 829870853.0, + "step": 21747 + }, + { + "epoch": 2.766569138786414, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9896007776260376, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8594736456871033, + "num_tokens": 829907519.0, + "step": 21748 + }, + { + "epoch": 2.7666963490650045, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8924918174743652, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8524762392044067, + "num_tokens": 829947162.0, + "step": 21749 + }, + { + "epoch": 2.766823559343595, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8000617027282715, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8632887601852417, + "num_tokens": 829991981.0, + "step": 21750 + }, + { + "epoch": 2.7669507696221856, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7936760187149048, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8719668388366699, + "num_tokens": 830032409.0, + "step": 21751 + }, + { + "epoch": 2.767077979900776, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7797771692276, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8682817816734314, + "num_tokens": 830072738.0, + "step": 21752 + }, + { + "epoch": 2.7672051901793666, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8043869733810425, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8615615367889404, + "num_tokens": 830110880.0, + "step": 21753 + }, + { + "epoch": 2.767332400457957, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9546358585357666, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8599964380264282, + "num_tokens": 830150148.0, + "step": 21754 + }, + { + "epoch": 2.7674596107365477, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.873792290687561, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8534108400344849, + "num_tokens": 830190717.0, + "step": 21755 + }, + { + "epoch": 2.767586821015138, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.557777166366577, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8674230575561523, + "num_tokens": 830226972.0, + "step": 21756 + }, + { + "epoch": 2.7677140312937283, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9475126266479492, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.875887393951416, + "num_tokens": 830262612.0, + "step": 21757 + }, + { + "epoch": 2.7678412415723193, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.685036063194275, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8712279796600342, + "num_tokens": 830306856.0, + "step": 21758 + }, + { + "epoch": 2.7679684518509093, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9551384449005127, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8638699054718018, + "num_tokens": 830340726.0, + "step": 21759 + }, + { + "epoch": 2.7680956621295003, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7965401411056519, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8656443953514099, + "num_tokens": 830380697.0, + "step": 21760 + }, + { + "epoch": 2.7682228724080904, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9447954893112183, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8436545133590698, + "num_tokens": 830414737.0, + "step": 21761 + }, + { + "epoch": 2.768350082686681, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.7207155227661133, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8663334846496582, + "num_tokens": 830456518.0, + "step": 21762 + }, + { + "epoch": 2.7684772929652715, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8936119079589844, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.863196074962616, + "num_tokens": 830494192.0, + "step": 21763 + }, + { + "epoch": 2.768604503243862, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9697071313858032, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8571698665618896, + "num_tokens": 830533659.0, + "step": 21764 + }, + { + "epoch": 2.7687317135224525, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.548125743865967, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8593567609786987, + "num_tokens": 830569605.0, + "step": 21765 + }, + { + "epoch": 2.768858923801043, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8641166687011719, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8437160849571228, + "num_tokens": 830610814.0, + "step": 21766 + }, + { + "epoch": 2.7689861340796336, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.00136137008667, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.86418217420578, + "num_tokens": 830646811.0, + "step": 21767 + }, + { + "epoch": 2.769113344358224, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.956209659576416, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8704596757888794, + "num_tokens": 830680984.0, + "step": 21768 + }, + { + "epoch": 2.7692405546368146, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.1436359882354736, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8646169304847717, + "num_tokens": 830713384.0, + "step": 21769 + }, + { + "epoch": 2.769367764915405, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0631916522979736, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8529184460639954, + "num_tokens": 830746922.0, + "step": 21770 + }, + { + "epoch": 2.7694949751939957, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.08026385307312, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8717702627182007, + "num_tokens": 830789459.0, + "step": 21771 + }, + { + "epoch": 2.769622185472586, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9238251447677612, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8629250526428223, + "num_tokens": 830826736.0, + "step": 21772 + }, + { + "epoch": 2.7697493957511767, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7159143686294556, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8684208393096924, + "num_tokens": 830868118.0, + "step": 21773 + }, + { + "epoch": 2.7698766060297673, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8251999616622925, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8631631135940552, + "num_tokens": 830911658.0, + "step": 21774 + }, + { + "epoch": 2.770003816308358, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9414676427841187, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8660768866539001, + "num_tokens": 830948456.0, + "step": 21775 + }, + { + "epoch": 2.7701310265869483, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.019854784011841, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.874823808670044, + "num_tokens": 830988790.0, + "step": 21776 + }, + { + "epoch": 2.770258236865539, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9306364059448242, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8581669926643372, + "num_tokens": 831024551.0, + "step": 21777 + }, + { + "epoch": 2.7703854471441294, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9975526332855225, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8632912635803223, + "num_tokens": 831060082.0, + "step": 21778 + }, + { + "epoch": 2.77051265742272, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.851640224456787, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8667929172515869, + "num_tokens": 831100067.0, + "step": 21779 + }, + { + "epoch": 2.77063986770131, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.823512315750122, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8760198950767517, + "num_tokens": 831131923.0, + "step": 21780 + }, + { + "epoch": 2.770767077979901, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.025743246078491, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8491586446762085, + "num_tokens": 831167985.0, + "step": 21781 + }, + { + "epoch": 2.770894288258491, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7461203336715698, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8592720031738281, + "num_tokens": 831211737.0, + "step": 21782 + }, + { + "epoch": 2.771021498537082, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.871660828590393, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8537826538085938, + "num_tokens": 831248764.0, + "step": 21783 + }, + { + "epoch": 2.771148708815672, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.837868332862854, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8723763227462769, + "num_tokens": 831284140.0, + "step": 21784 + }, + { + "epoch": 2.771275919094263, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.815203070640564, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8689848780632019, + "num_tokens": 831318567.0, + "step": 21785 + }, + { + "epoch": 2.771403129372853, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8339747190475464, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8592592477798462, + "num_tokens": 831357128.0, + "step": 21786 + }, + { + "epoch": 2.7715303396514437, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.058682680130005, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8432198762893677, + "num_tokens": 831391551.0, + "step": 21787 + }, + { + "epoch": 2.771657549930034, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9417935609817505, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8610562086105347, + "num_tokens": 831430332.0, + "step": 21788 + }, + { + "epoch": 2.7717847602086247, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8750146627426147, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8695342540740967, + "num_tokens": 831471639.0, + "step": 21789 + }, + { + "epoch": 2.7719119704872153, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.874399185180664, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8703665733337402, + "num_tokens": 831506470.0, + "step": 21790 + }, + { + "epoch": 2.772039180765806, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8084383010864258, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8588002324104309, + "num_tokens": 831547212.0, + "step": 21791 + }, + { + "epoch": 2.7721663910443963, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9983937740325928, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8511112928390503, + "num_tokens": 831593181.0, + "step": 21792 + }, + { + "epoch": 2.772293601322987, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9505815505981445, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8817348480224609, + "num_tokens": 831624889.0, + "step": 21793 + }, + { + "epoch": 2.7724208116015774, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8648741245269775, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8719394207000732, + "num_tokens": 831662389.0, + "step": 21794 + }, + { + "epoch": 2.772548021880168, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8071955442428589, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.867547869682312, + "num_tokens": 831705282.0, + "step": 21795 + }, + { + "epoch": 2.7726752321587584, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8605780601501465, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8725997805595398, + "num_tokens": 831741604.0, + "step": 21796 + }, + { + "epoch": 2.772802442437349, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.8157129287719727, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8550032377243042, + "num_tokens": 831784165.0, + "step": 21797 + }, + { + "epoch": 2.7729296527159395, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.059380054473877, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8695147037506104, + "num_tokens": 831821101.0, + "step": 21798 + }, + { + "epoch": 2.77305686299453, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.007580041885376, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8676912784576416, + "num_tokens": 831855766.0, + "step": 21799 + }, + { + "epoch": 2.7731840732731206, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8359347581863403, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8694148659706116, + "num_tokens": 831895692.0, + "step": 21800 + }, + { + "epoch": 2.773311283551711, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8475550413131714, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8840862512588501, + "num_tokens": 831931079.0, + "step": 21801 + }, + { + "epoch": 2.7734384938303016, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.061383008956909, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8381752967834473, + "num_tokens": 831969742.0, + "step": 21802 + }, + { + "epoch": 2.773565704108892, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9656568765640259, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8551045656204224, + "num_tokens": 832011059.0, + "step": 21803 + }, + { + "epoch": 2.7736929143874827, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 4.058288097381592, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.876060962677002, + "num_tokens": 832042768.0, + "step": 21804 + }, + { + "epoch": 2.7738201246660728, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.029308557510376, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8736701011657715, + "num_tokens": 832077699.0, + "step": 21805 + }, + { + "epoch": 2.7739473349446637, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9229559898376465, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8729977011680603, + "num_tokens": 832113528.0, + "step": 21806 + }, + { + "epoch": 2.774074545223254, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8259860277175903, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8686236143112183, + "num_tokens": 832149108.0, + "step": 21807 + }, + { + "epoch": 2.774201755501845, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.779833197593689, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8813689947128296, + "num_tokens": 832189279.0, + "step": 21808 + }, + { + "epoch": 2.774328965780435, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8605401515960693, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8669835329055786, + "num_tokens": 832224423.0, + "step": 21809 + }, + { + "epoch": 2.774456176059026, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9980014562606812, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8600720167160034, + "num_tokens": 832260389.0, + "step": 21810 + }, + { + "epoch": 2.774583386337616, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0390501022338867, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8651330471038818, + "num_tokens": 832297591.0, + "step": 21811 + }, + { + "epoch": 2.7747105966162064, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1310033798217773, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8676086664199829, + "num_tokens": 832330195.0, + "step": 21812 + }, + { + "epoch": 2.774837806894797, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9580212831497192, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8714898824691772, + "num_tokens": 832361613.0, + "step": 21813 + }, + { + "epoch": 2.7749650171733875, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9368876218795776, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8711619973182678, + "num_tokens": 832402800.0, + "step": 21814 + }, + { + "epoch": 2.775092227451978, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1035940647125244, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8669117093086243, + "num_tokens": 832445161.0, + "step": 21815 + }, + { + "epoch": 2.7752194377305686, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9281766414642334, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8498144149780273, + "num_tokens": 832487905.0, + "step": 21816 + }, + { + "epoch": 2.775346648009159, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9923336505889893, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8649920225143433, + "num_tokens": 832521107.0, + "step": 21817 + }, + { + "epoch": 2.7754738582877496, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.880382776260376, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8622864484786987, + "num_tokens": 832557213.0, + "step": 21818 + }, + { + "epoch": 2.77560106856634, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8272662162780762, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8529755473136902, + "num_tokens": 832600445.0, + "step": 21819 + }, + { + "epoch": 2.7757282788449307, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0293309688568115, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8654036521911621, + "num_tokens": 832635994.0, + "step": 21820 + }, + { + "epoch": 2.775855489123521, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9200317859649658, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8763625621795654, + "num_tokens": 832677151.0, + "step": 21821 + }, + { + "epoch": 2.7759826994021117, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9352613687515259, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8641835451126099, + "num_tokens": 832713995.0, + "step": 21822 + }, + { + "epoch": 2.7761099096807023, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8606494665145874, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8565361499786377, + "num_tokens": 832754201.0, + "step": 21823 + }, + { + "epoch": 2.776237119959293, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8934710025787354, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8611078262329102, + "num_tokens": 832796920.0, + "step": 21824 + }, + { + "epoch": 2.7763643302378833, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9652365446090698, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8617426156997681, + "num_tokens": 832835952.0, + "step": 21825 + }, + { + "epoch": 2.776491540516474, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.6968765258789062, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8688764572143555, + "num_tokens": 832878306.0, + "step": 21826 + }, + { + "epoch": 2.7766187507950644, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8541847467422485, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8567001819610596, + "num_tokens": 832914315.0, + "step": 21827 + }, + { + "epoch": 2.776745961073655, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9194786548614502, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8621294498443604, + "num_tokens": 832956661.0, + "step": 21828 + }, + { + "epoch": 2.7768731713522454, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7681926488876343, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8712738156318665, + "num_tokens": 832999446.0, + "step": 21829 + }, + { + "epoch": 2.7770003816308355, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9323172569274902, + "learning_rate": 1e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8382623195648193, + "num_tokens": 833039001.0, + "step": 21830 + }, + { + "epoch": 2.7771275919094265, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9824552536010742, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.863898515701294, + "num_tokens": 833074334.0, + "step": 21831 + }, + { + "epoch": 2.7772548021880166, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.008929491043091, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8534899950027466, + "num_tokens": 833113089.0, + "step": 21832 + }, + { + "epoch": 2.7773820124666075, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.135204792022705, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8624730110168457, + "num_tokens": 833149755.0, + "step": 21833 + }, + { + "epoch": 2.7775092227451976, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8965400457382202, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8548221588134766, + "num_tokens": 833187718.0, + "step": 21834 + }, + { + "epoch": 2.7776364330237886, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9420229196548462, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.87039715051651, + "num_tokens": 833221416.0, + "step": 21835 + }, + { + "epoch": 2.7777636433023787, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7610540390014648, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8798417448997498, + "num_tokens": 833261101.0, + "step": 21836 + }, + { + "epoch": 2.777890853580969, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1993837356567383, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8734226226806641, + "num_tokens": 833296804.0, + "step": 21837 + }, + { + "epoch": 2.7780180638595597, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.098569631576538, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8663509488105774, + "num_tokens": 833332005.0, + "step": 21838 + }, + { + "epoch": 2.7781452741381503, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0453336238861084, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8609179258346558, + "num_tokens": 833371991.0, + "step": 21839 + }, + { + "epoch": 2.778272484416741, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0037572383880615, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.877608060836792, + "num_tokens": 833407514.0, + "step": 21840 + }, + { + "epoch": 2.7783996946953313, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9889557361602783, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8631040453910828, + "num_tokens": 833444519.0, + "step": 21841 + }, + { + "epoch": 2.778526904973922, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8418114185333252, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8604768514633179, + "num_tokens": 833482252.0, + "step": 21842 + }, + { + "epoch": 2.7786541152525124, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.990578293800354, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8645163178443909, + "num_tokens": 833515513.0, + "step": 21843 + }, + { + "epoch": 2.778781325531103, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8429309129714966, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8718042969703674, + "num_tokens": 833551188.0, + "step": 21844 + }, + { + "epoch": 2.7789085358096934, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.883490800857544, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8750030994415283, + "num_tokens": 833589159.0, + "step": 21845 + }, + { + "epoch": 2.779035746088284, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.16051983833313, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8541522026062012, + "num_tokens": 833623581.0, + "step": 21846 + }, + { + "epoch": 2.7791629563668745, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.024373769760132, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8531263470649719, + "num_tokens": 833660466.0, + "step": 21847 + }, + { + "epoch": 2.779290166645465, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.950870156288147, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8543931245803833, + "num_tokens": 833697465.0, + "step": 21848 + }, + { + "epoch": 2.7794173769240555, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8026007413864136, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8622986078262329, + "num_tokens": 833735178.0, + "step": 21849 + }, + { + "epoch": 2.779544587202646, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.047102212905884, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8801528811454773, + "num_tokens": 833770193.0, + "step": 21850 + }, + { + "epoch": 2.7796717974812366, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7768969535827637, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8657301664352417, + "num_tokens": 833807836.0, + "step": 21851 + }, + { + "epoch": 2.779799007759827, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.826513409614563, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8607891798019409, + "num_tokens": 833847605.0, + "step": 21852 + }, + { + "epoch": 2.7799262180384177, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.818051815032959, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8675646781921387, + "num_tokens": 833884239.0, + "step": 21853 + }, + { + "epoch": 2.780053428317008, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8772801160812378, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8645696640014648, + "num_tokens": 833921305.0, + "step": 21854 + }, + { + "epoch": 2.7801806385955983, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.073082208633423, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8560209274291992, + "num_tokens": 833959297.0, + "step": 21855 + }, + { + "epoch": 2.7803078488741892, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.113940954208374, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8404272794723511, + "num_tokens": 833992135.0, + "step": 21856 + }, + { + "epoch": 2.7804350591527793, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.6927874088287354, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8547626733779907, + "num_tokens": 834035933.0, + "step": 21857 + }, + { + "epoch": 2.7805622694313703, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.005542516708374, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8632805347442627, + "num_tokens": 834070187.0, + "step": 21858 + }, + { + "epoch": 2.7806894797099604, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 1.7775437831878662, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8672863245010376, + "num_tokens": 834107370.0, + "step": 21859 + }, + { + "epoch": 2.780816689988551, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 1.74003005027771, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8566727638244629, + "num_tokens": 834145357.0, + "step": 21860 + }, + { + "epoch": 2.7809439002671414, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6880594491958618, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8636379241943359, + "num_tokens": 834187250.0, + "step": 21861 + }, + { + "epoch": 2.781071110545732, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 1.8458234071731567, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8755003213882446, + "num_tokens": 834227598.0, + "step": 21862 + }, + { + "epoch": 2.7811983208243225, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.069415330886841, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8794198632240295, + "num_tokens": 834267032.0, + "step": 21863 + }, + { + "epoch": 2.781325531102913, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7754591703414917, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8541710376739502, + "num_tokens": 834308479.0, + "step": 21864 + }, + { + "epoch": 2.7814527413815036, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0019407272338867, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8746002316474915, + "num_tokens": 834346185.0, + "step": 21865 + }, + { + "epoch": 2.781579951660094, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8301995992660522, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8740831613540649, + "num_tokens": 834384404.0, + "step": 21866 + }, + { + "epoch": 2.7817071619386846, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8645548820495605, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8634989857673645, + "num_tokens": 834424833.0, + "step": 21867 + }, + { + "epoch": 2.781834372217275, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0590016841888428, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8661104440689087, + "num_tokens": 834464215.0, + "step": 21868 + }, + { + "epoch": 2.7819615824958657, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0751702785491943, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8425083160400391, + "num_tokens": 834502452.0, + "step": 21869 + }, + { + "epoch": 2.782088792774456, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 1.8831846714019775, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8664616346359253, + "num_tokens": 834539690.0, + "step": 21870 + }, + { + "epoch": 2.7822160030530467, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8242816925048828, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8538599014282227, + "num_tokens": 834583193.0, + "step": 21871 + }, + { + "epoch": 2.7823432133316373, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8912357091903687, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8587416410446167, + "num_tokens": 834620028.0, + "step": 21872 + }, + { + "epoch": 2.782470423610228, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9206427335739136, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8535494208335876, + "num_tokens": 834657351.0, + "step": 21873 + }, + { + "epoch": 2.7825976338888183, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.860357403755188, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8793165683746338, + "num_tokens": 834693687.0, + "step": 21874 + }, + { + "epoch": 2.782724844167409, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9954562187194824, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8632194995880127, + "num_tokens": 834736171.0, + "step": 21875 + }, + { + "epoch": 2.7828520544459994, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9576643705368042, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8785474300384521, + "num_tokens": 834775699.0, + "step": 21876 + }, + { + "epoch": 2.78297926472459, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2845423221588135, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8588477373123169, + "num_tokens": 834816053.0, + "step": 21877 + }, + { + "epoch": 2.78310647500318, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.001347780227661, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8607763051986694, + "num_tokens": 834857125.0, + "step": 21878 + }, + { + "epoch": 2.783233685281771, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.905806541442871, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.878673255443573, + "num_tokens": 834895676.0, + "step": 21879 + }, + { + "epoch": 2.783360895560361, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9976431131362915, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8688758015632629, + "num_tokens": 834927748.0, + "step": 21880 + }, + { + "epoch": 2.783488105838952, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8280048370361328, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8618721961975098, + "num_tokens": 834966722.0, + "step": 21881 + }, + { + "epoch": 2.783615316117542, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9998846054077148, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8664594292640686, + "num_tokens": 835003770.0, + "step": 21882 + }, + { + "epoch": 2.783742526396133, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9278347492218018, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8741625547409058, + "num_tokens": 835041504.0, + "step": 21883 + }, + { + "epoch": 2.783869736674723, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.871218204498291, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8668785095214844, + "num_tokens": 835075179.0, + "step": 21884 + }, + { + "epoch": 2.7839969469533137, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9318135976791382, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8544178009033203, + "num_tokens": 835113080.0, + "step": 21885 + }, + { + "epoch": 2.784124157231904, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8456103801727295, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8643009662628174, + "num_tokens": 835150586.0, + "step": 21886 + }, + { + "epoch": 2.7842513675104947, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.0318899154663086, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8613519072532654, + "num_tokens": 835184049.0, + "step": 21887 + }, + { + "epoch": 2.7843785777890853, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 1.970341682434082, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8612514138221741, + "num_tokens": 835222745.0, + "step": 21888 + }, + { + "epoch": 2.784505788067676, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9204018115997314, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8522762060165405, + "num_tokens": 835263017.0, + "step": 21889 + }, + { + "epoch": 2.7846329983462663, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8488744497299194, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8782845139503479, + "num_tokens": 835303916.0, + "step": 21890 + }, + { + "epoch": 2.784760208624857, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0588390827178955, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8789915442466736, + "num_tokens": 835343148.0, + "step": 21891 + }, + { + "epoch": 2.7848874189034474, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.091081380844116, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8625970482826233, + "num_tokens": 835382337.0, + "step": 21892 + }, + { + "epoch": 2.785014629182038, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0278615951538086, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8596934080123901, + "num_tokens": 835423645.0, + "step": 21893 + }, + { + "epoch": 2.7851418394606284, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9153751134872437, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8353722095489502, + "num_tokens": 835464590.0, + "step": 21894 + }, + { + "epoch": 2.785269049739219, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9871124029159546, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.889283299446106, + "num_tokens": 835496376.0, + "step": 21895 + }, + { + "epoch": 2.7853962600178095, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.948938012123108, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8494789600372314, + "num_tokens": 835541231.0, + "step": 21896 + }, + { + "epoch": 2.7855234702964, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8727401494979858, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8679632544517517, + "num_tokens": 835584038.0, + "step": 21897 + }, + { + "epoch": 2.7856506805749905, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8007992506027222, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.859067976474762, + "num_tokens": 835627189.0, + "step": 21898 + }, + { + "epoch": 2.785777890853581, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.020794630050659, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.864727258682251, + "num_tokens": 835659701.0, + "step": 21899 + }, + { + "epoch": 2.7859051011321716, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9567053318023682, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8717654347419739, + "num_tokens": 835694045.0, + "step": 21900 + }, + { + "epoch": 2.786032311410762, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8893859386444092, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.861716091632843, + "num_tokens": 835732118.0, + "step": 21901 + }, + { + "epoch": 2.7861595216893527, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.440382480621338, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8735917806625366, + "num_tokens": 835772700.0, + "step": 21902 + }, + { + "epoch": 2.7862867319679427, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.256147623062134, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8479405641555786, + "num_tokens": 835811206.0, + "step": 21903 + }, + { + "epoch": 2.7864139422465337, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0591461658477783, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8687995672225952, + "num_tokens": 835844381.0, + "step": 21904 + }, + { + "epoch": 2.786541152525124, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.070826292037964, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8567690849304199, + "num_tokens": 835879926.0, + "step": 21905 + }, + { + "epoch": 2.7866683628037148, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.968307375907898, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8522891402244568, + "num_tokens": 835924014.0, + "step": 21906 + }, + { + "epoch": 2.786795573082305, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1163222789764404, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8581228256225586, + "num_tokens": 835956054.0, + "step": 21907 + }, + { + "epoch": 2.786922783360896, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.933606505393982, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8713467717170715, + "num_tokens": 835988885.0, + "step": 21908 + }, + { + "epoch": 2.787049993639486, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8378409147262573, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8635027408599854, + "num_tokens": 836028885.0, + "step": 21909 + }, + { + "epoch": 2.7871772039180764, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8442412614822388, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8775790929794312, + "num_tokens": 836067340.0, + "step": 21910 + }, + { + "epoch": 2.787304414196667, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0207676887512207, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.863595724105835, + "num_tokens": 836104432.0, + "step": 21911 + }, + { + "epoch": 2.7874316244752575, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.019437313079834, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8609123229980469, + "num_tokens": 836141197.0, + "step": 21912 + }, + { + "epoch": 2.787558834753848, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.790709376335144, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8761415481567383, + "num_tokens": 836176949.0, + "step": 21913 + }, + { + "epoch": 2.7876860450324386, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1757328510284424, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8628698587417603, + "num_tokens": 836206484.0, + "step": 21914 + }, + { + "epoch": 2.787813255311029, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9925730228424072, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.869931697845459, + "num_tokens": 836243056.0, + "step": 21915 + }, + { + "epoch": 2.7879404655896196, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9473977088928223, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8598916530609131, + "num_tokens": 836281154.0, + "step": 21916 + }, + { + "epoch": 2.78806767586821, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9378244876861572, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8640686273574829, + "num_tokens": 836319895.0, + "step": 21917 + }, + { + "epoch": 2.7881948861468007, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 3.077625036239624, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8711178302764893, + "num_tokens": 836357227.0, + "step": 21918 + }, + { + "epoch": 2.788322096425391, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.989792823791504, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8564260601997375, + "num_tokens": 836398442.0, + "step": 21919 + }, + { + "epoch": 2.7884493067039817, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7838692665100098, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8723580837249756, + "num_tokens": 836436660.0, + "step": 21920 + }, + { + "epoch": 2.7885765169825723, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9461288452148438, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8589562177658081, + "num_tokens": 836479180.0, + "step": 21921 + }, + { + "epoch": 2.788703727261163, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.956734538078308, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8719782829284668, + "num_tokens": 836518874.0, + "step": 21922 + }, + { + "epoch": 2.7888309375397533, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8555197715759277, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8601052761077881, + "num_tokens": 836556910.0, + "step": 21923 + }, + { + "epoch": 2.788958147818344, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7409425973892212, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.876537561416626, + "num_tokens": 836598151.0, + "step": 21924 + }, + { + "epoch": 2.7890853580969344, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7963814735412598, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8576270341873169, + "num_tokens": 836637952.0, + "step": 21925 + }, + { + "epoch": 2.789212568375525, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8375262022018433, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8788073062896729, + "num_tokens": 836677138.0, + "step": 21926 + }, + { + "epoch": 2.7893397786541154, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8443670272827148, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8708223104476929, + "num_tokens": 836716880.0, + "step": 21927 + }, + { + "epoch": 2.7894669889327055, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8230351209640503, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8781493306159973, + "num_tokens": 836752572.0, + "step": 21928 + }, + { + "epoch": 2.7895941992112965, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9682872295379639, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8597156405448914, + "num_tokens": 836794321.0, + "step": 21929 + }, + { + "epoch": 2.7897214094898866, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0849733352661133, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.878766655921936, + "num_tokens": 836829183.0, + "step": 21930 + }, + { + "epoch": 2.7898486197684775, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9701218605041504, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8522179126739502, + "num_tokens": 836864644.0, + "step": 21931 + }, + { + "epoch": 2.7899758300470676, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9978176355361938, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8770748376846313, + "num_tokens": 836895384.0, + "step": 21932 + }, + { + "epoch": 2.7901030403256586, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8224806785583496, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8680024147033691, + "num_tokens": 836943336.0, + "step": 21933 + }, + { + "epoch": 2.7902302506042487, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8034080266952515, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8799067139625549, + "num_tokens": 836980708.0, + "step": 21934 + }, + { + "epoch": 2.790357460882839, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.96448814868927, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8690412640571594, + "num_tokens": 837015738.0, + "step": 21935 + }, + { + "epoch": 2.7904846711614297, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.902997374534607, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8723902702331543, + "num_tokens": 837053014.0, + "step": 21936 + }, + { + "epoch": 2.7906118814400203, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1814160346984863, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8602218627929688, + "num_tokens": 837083579.0, + "step": 21937 + }, + { + "epoch": 2.790739091718611, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.765790343284607, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8743257522583008, + "num_tokens": 837126201.0, + "step": 21938 + }, + { + "epoch": 2.7908663019972013, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9943921566009521, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8742772936820984, + "num_tokens": 837162400.0, + "step": 21939 + }, + { + "epoch": 2.790993512275792, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.679527997970581, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8696274757385254, + "num_tokens": 837205733.0, + "step": 21940 + }, + { + "epoch": 2.7911207225543824, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7885797023773193, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8539711236953735, + "num_tokens": 837243838.0, + "step": 21941 + }, + { + "epoch": 2.791247932832973, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7956465482711792, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.861060380935669, + "num_tokens": 837280936.0, + "step": 21942 + }, + { + "epoch": 2.7913751431115634, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7885515689849854, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8652878403663635, + "num_tokens": 837321790.0, + "step": 21943 + }, + { + "epoch": 2.791502353390154, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7951186895370483, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8699880242347717, + "num_tokens": 837362023.0, + "step": 21944 + }, + { + "epoch": 2.7916295636687445, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8082499504089355, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8726608753204346, + "num_tokens": 837404876.0, + "step": 21945 + }, + { + "epoch": 2.791756773947335, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.905279278755188, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8578909635543823, + "num_tokens": 837442899.0, + "step": 21946 + }, + { + "epoch": 2.7918839842259255, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.141660451889038, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.873706579208374, + "num_tokens": 837477611.0, + "step": 21947 + }, + { + "epoch": 2.792011194504516, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.945396065711975, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.851646900177002, + "num_tokens": 837515928.0, + "step": 21948 + }, + { + "epoch": 2.7921384047831066, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9917820692062378, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8651765584945679, + "num_tokens": 837553042.0, + "step": 21949 + }, + { + "epoch": 2.792265615061697, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.813011884689331, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8720149993896484, + "num_tokens": 837589749.0, + "step": 21950 + }, + { + "epoch": 2.7923928253402877, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7027184963226318, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8826656341552734, + "num_tokens": 837631104.0, + "step": 21951 + }, + { + "epoch": 2.792520035618878, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.902606725692749, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8751896619796753, + "num_tokens": 837668173.0, + "step": 21952 + }, + { + "epoch": 2.7926472458974683, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.055453300476074, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8715900182723999, + "num_tokens": 837704282.0, + "step": 21953 + }, + { + "epoch": 2.7927744561760592, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8340333700180054, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8556197881698608, + "num_tokens": 837750790.0, + "step": 21954 + }, + { + "epoch": 2.7929016664546493, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8713514804840088, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8625789880752563, + "num_tokens": 837792041.0, + "step": 21955 + }, + { + "epoch": 2.7930288767332403, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0546138286590576, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8722150325775146, + "num_tokens": 837818645.0, + "step": 21956 + }, + { + "epoch": 2.7931560870118304, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8799002170562744, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8742439150810242, + "num_tokens": 837859816.0, + "step": 21957 + }, + { + "epoch": 2.793283297290421, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7968159914016724, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8596351742744446, + "num_tokens": 837903970.0, + "step": 21958 + }, + { + "epoch": 2.7934105075690114, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0118157863616943, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8639810681343079, + "num_tokens": 837941158.0, + "step": 21959 + }, + { + "epoch": 2.793537717847602, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.847360610961914, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.871306300163269, + "num_tokens": 837978157.0, + "step": 21960 + }, + { + "epoch": 2.7936649281261925, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7059965133666992, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8760677576065063, + "num_tokens": 838017871.0, + "step": 21961 + }, + { + "epoch": 2.793792138404783, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.025684356689453, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8777065873146057, + "num_tokens": 838050521.0, + "step": 21962 + }, + { + "epoch": 2.7939193486833735, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9317302703857422, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8737460374832153, + "num_tokens": 838084450.0, + "step": 21963 + }, + { + "epoch": 2.794046558961964, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1765410900115967, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8594691753387451, + "num_tokens": 838121505.0, + "step": 21964 + }, + { + "epoch": 2.7941737692405546, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.779784917831421, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8633034229278564, + "num_tokens": 838166148.0, + "step": 21965 + }, + { + "epoch": 2.794300979519145, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9532767534255981, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.871748149394989, + "num_tokens": 838203578.0, + "step": 21966 + }, + { + "epoch": 2.7944281897977357, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9006291627883911, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8674885034561157, + "num_tokens": 838240702.0, + "step": 21967 + }, + { + "epoch": 2.794555400076326, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9544259309768677, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8731436133384705, + "num_tokens": 838275639.0, + "step": 21968 + }, + { + "epoch": 2.7946826103549167, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.205444097518921, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8683059215545654, + "num_tokens": 838310045.0, + "step": 21969 + }, + { + "epoch": 2.7948098206335072, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.831087589263916, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8666744232177734, + "num_tokens": 838351571.0, + "step": 21970 + }, + { + "epoch": 2.7949370309120978, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8020482063293457, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8775167465209961, + "num_tokens": 838393804.0, + "step": 21971 + }, + { + "epoch": 2.7950642411906883, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9431307315826416, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8676338791847229, + "num_tokens": 838435516.0, + "step": 21972 + }, + { + "epoch": 2.795191451469279, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.269162178039551, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8729473352432251, + "num_tokens": 838465425.0, + "step": 21973 + }, + { + "epoch": 2.7953186617478694, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9163261651992798, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8630314469337463, + "num_tokens": 838512832.0, + "step": 21974 + }, + { + "epoch": 2.79544587202646, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9088075160980225, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8739073276519775, + "num_tokens": 838552806.0, + "step": 21975 + }, + { + "epoch": 2.79557308230505, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9022741317749023, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8642638921737671, + "num_tokens": 838588592.0, + "step": 21976 + }, + { + "epoch": 2.795700292583641, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8109296560287476, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8646215200424194, + "num_tokens": 838629402.0, + "step": 21977 + }, + { + "epoch": 2.795827502862231, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7936370372772217, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8562958836555481, + "num_tokens": 838676580.0, + "step": 21978 + }, + { + "epoch": 2.795954713140822, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.906461477279663, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8592686057090759, + "num_tokens": 838715773.0, + "step": 21979 + }, + { + "epoch": 2.796081923419412, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.04742169380188, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8566337823867798, + "num_tokens": 838750270.0, + "step": 21980 + }, + { + "epoch": 2.796209133698003, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0760958194732666, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8447939157485962, + "num_tokens": 838794823.0, + "step": 21981 + }, + { + "epoch": 2.796336343976593, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0647757053375244, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8562309145927429, + "num_tokens": 838828901.0, + "step": 21982 + }, + { + "epoch": 2.7964635542551837, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8560471534729004, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.858428955078125, + "num_tokens": 838872648.0, + "step": 21983 + }, + { + "epoch": 2.796590764533774, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0335488319396973, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8783310651779175, + "num_tokens": 838905150.0, + "step": 21984 + }, + { + "epoch": 2.7967179748123647, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9312516450881958, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.865308940410614, + "num_tokens": 838949027.0, + "step": 21985 + }, + { + "epoch": 2.7968451850909553, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.827109456062317, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8670481443405151, + "num_tokens": 838987320.0, + "step": 21986 + }, + { + "epoch": 2.796972395369546, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9287289381027222, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8619424104690552, + "num_tokens": 839027396.0, + "step": 21987 + }, + { + "epoch": 2.7970996056481363, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6771597862243652, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8716328144073486, + "num_tokens": 839069779.0, + "step": 21988 + }, + { + "epoch": 2.797226815926727, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6985061168670654, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8708997964859009, + "num_tokens": 839110919.0, + "step": 21989 + }, + { + "epoch": 2.7973540262053174, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8343535661697388, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8799925446510315, + "num_tokens": 839143689.0, + "step": 21990 + }, + { + "epoch": 2.797481236483908, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.930071234703064, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8458881974220276, + "num_tokens": 839181274.0, + "step": 21991 + }, + { + "epoch": 2.7976084467624984, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1508607864379883, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8486089706420898, + "num_tokens": 839217038.0, + "step": 21992 + }, + { + "epoch": 2.797735657041089, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.886916160583496, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8662468791007996, + "num_tokens": 839260104.0, + "step": 21993 + }, + { + "epoch": 2.7978628673196795, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.173309087753296, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.869208574295044, + "num_tokens": 839294328.0, + "step": 21994 + }, + { + "epoch": 2.79799007759827, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0254361629486084, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8652348518371582, + "num_tokens": 839326662.0, + "step": 21995 + }, + { + "epoch": 2.7981172878768605, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1660969257354736, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8653695583343506, + "num_tokens": 839355997.0, + "step": 21996 + }, + { + "epoch": 2.798244498155451, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8008332252502441, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8716180920600891, + "num_tokens": 839396596.0, + "step": 21997 + }, + { + "epoch": 2.7983717084340416, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8867180347442627, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8679297566413879, + "num_tokens": 839432251.0, + "step": 21998 + }, + { + "epoch": 2.798498918712632, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9448716640472412, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8756784200668335, + "num_tokens": 839465987.0, + "step": 21999 + }, + { + "epoch": 2.7986261289912227, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8797041177749634, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8678334951400757, + "num_tokens": 839506147.0, + "step": 22000 + }, + { + "epoch": 2.7987533392698127, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9000530242919922, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8575192093849182, + "num_tokens": 839544620.0, + "step": 22001 + }, + { + "epoch": 2.7988805495484037, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8561631441116333, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8558450937271118, + "num_tokens": 839583217.0, + "step": 22002 + }, + { + "epoch": 2.799007759826994, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9475408792495728, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8854752779006958, + "num_tokens": 839621283.0, + "step": 22003 + }, + { + "epoch": 2.7991349701055848, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.909920573234558, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8605116009712219, + "num_tokens": 839658072.0, + "step": 22004 + }, + { + "epoch": 2.799262180384175, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.771791696548462, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8484201431274414, + "num_tokens": 839700926.0, + "step": 22005 + }, + { + "epoch": 2.799389390662766, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.89743971824646, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8744107484817505, + "num_tokens": 839736160.0, + "step": 22006 + }, + { + "epoch": 2.799516600941356, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9018757343292236, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8814382553100586, + "num_tokens": 839776834.0, + "step": 22007 + }, + { + "epoch": 2.7996438112199464, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8683053255081177, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8596913814544678, + "num_tokens": 839818156.0, + "step": 22008 + }, + { + "epoch": 2.799771021498537, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7940162420272827, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8608731031417847, + "num_tokens": 839857133.0, + "step": 22009 + }, + { + "epoch": 2.7998982317771275, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8843634128570557, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8502277731895447, + "num_tokens": 839893129.0, + "step": 22010 + }, + { + "epoch": 2.800025442055718, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.005164623260498, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8683452010154724, + "num_tokens": 839924622.0, + "step": 22011 + }, + { + "epoch": 2.8001526523343085, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.884118676185608, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8834173679351807, + "num_tokens": 839961276.0, + "step": 22012 + }, + { + "epoch": 2.800279862612899, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8360161781311035, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8555402755737305, + "num_tokens": 840002554.0, + "step": 22013 + }, + { + "epoch": 2.8004070728914896, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8712562322616577, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8648298978805542, + "num_tokens": 840041642.0, + "step": 22014 + }, + { + "epoch": 2.80053428317008, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7220696210861206, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8735752701759338, + "num_tokens": 840083532.0, + "step": 22015 + }, + { + "epoch": 2.8006614934486707, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8325624465942383, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8628257513046265, + "num_tokens": 840124221.0, + "step": 22016 + }, + { + "epoch": 2.800788703727261, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.815055012702942, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8704137802124023, + "num_tokens": 840162309.0, + "step": 22017 + }, + { + "epoch": 2.8009159140058517, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.012728691101074, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.866966962814331, + "num_tokens": 840194778.0, + "step": 22018 + }, + { + "epoch": 2.8010431242844422, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.041086435317993, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8812402486801147, + "num_tokens": 840225081.0, + "step": 22019 + }, + { + "epoch": 2.8011703345630328, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8173038959503174, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.864783763885498, + "num_tokens": 840267698.0, + "step": 22020 + }, + { + "epoch": 2.8012975448416233, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9517430067062378, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.858468234539032, + "num_tokens": 840303255.0, + "step": 22021 + }, + { + "epoch": 2.801424755120214, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9563685655593872, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8640871047973633, + "num_tokens": 840344616.0, + "step": 22022 + }, + { + "epoch": 2.8015519653988044, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.027900457382202, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8670816421508789, + "num_tokens": 840383293.0, + "step": 22023 + }, + { + "epoch": 2.801679175677395, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9503724575042725, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.854192316532135, + "num_tokens": 840424583.0, + "step": 22024 + }, + { + "epoch": 2.8018063859559854, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0982799530029297, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8581734299659729, + "num_tokens": 840458752.0, + "step": 22025 + }, + { + "epoch": 2.8019335962345755, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8541138172149658, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.87117600440979, + "num_tokens": 840498441.0, + "step": 22026 + }, + { + "epoch": 2.8020608065131665, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.877816915512085, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8750886917114258, + "num_tokens": 840537838.0, + "step": 22027 + }, + { + "epoch": 2.8021880167917566, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2374322414398193, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8535211682319641, + "num_tokens": 840571770.0, + "step": 22028 + }, + { + "epoch": 2.8023152270703475, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0304386615753174, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8596825003623962, + "num_tokens": 840607559.0, + "step": 22029 + }, + { + "epoch": 2.8024424373489376, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8932604789733887, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8516544103622437, + "num_tokens": 840650156.0, + "step": 22030 + }, + { + "epoch": 2.8025696476275286, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1044747829437256, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8571567535400391, + "num_tokens": 840686590.0, + "step": 22031 + }, + { + "epoch": 2.8026968579061187, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9691089391708374, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8570463061332703, + "num_tokens": 840724445.0, + "step": 22032 + }, + { + "epoch": 2.802824068184709, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8931854963302612, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8798452615737915, + "num_tokens": 840765047.0, + "step": 22033 + }, + { + "epoch": 2.8029512784632997, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.002685546875, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8519678115844727, + "num_tokens": 840802375.0, + "step": 22034 + }, + { + "epoch": 2.8030784887418903, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8887823820114136, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8648016452789307, + "num_tokens": 840839994.0, + "step": 22035 + }, + { + "epoch": 2.803205699020481, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9290651082992554, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8629553318023682, + "num_tokens": 840877144.0, + "step": 22036 + }, + { + "epoch": 2.8033329092990713, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8165754079818726, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8561339378356934, + "num_tokens": 840918478.0, + "step": 22037 + }, + { + "epoch": 2.803460119577662, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9366251230239868, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8755039572715759, + "num_tokens": 840958484.0, + "step": 22038 + }, + { + "epoch": 2.8035873298562524, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7314424514770508, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8673102855682373, + "num_tokens": 841000035.0, + "step": 22039 + }, + { + "epoch": 2.803714540134843, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9834970235824585, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.860639214515686, + "num_tokens": 841035836.0, + "step": 22040 + }, + { + "epoch": 2.8038417504134334, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8649015426635742, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8668556213378906, + "num_tokens": 841069295.0, + "step": 22041 + }, + { + "epoch": 2.803968960692024, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9172722101211548, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8612030744552612, + "num_tokens": 841108341.0, + "step": 22042 + }, + { + "epoch": 2.8040961709706145, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0374860763549805, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8758323788642883, + "num_tokens": 841142945.0, + "step": 22043 + }, + { + "epoch": 2.804223381249205, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9789808988571167, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8789054751396179, + "num_tokens": 841174137.0, + "step": 22044 + }, + { + "epoch": 2.8043505915277955, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.924426555633545, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8758097887039185, + "num_tokens": 841215914.0, + "step": 22045 + }, + { + "epoch": 2.804477801806386, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9426590204238892, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8668427467346191, + "num_tokens": 841256097.0, + "step": 22046 + }, + { + "epoch": 2.8046050120849766, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0315539836883545, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8715428709983826, + "num_tokens": 841288997.0, + "step": 22047 + }, + { + "epoch": 2.804732222363567, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0094432830810547, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8595001697540283, + "num_tokens": 841328053.0, + "step": 22048 + }, + { + "epoch": 2.8048594326421576, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.879686713218689, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8844919204711914, + "num_tokens": 841366848.0, + "step": 22049 + }, + { + "epoch": 2.804986642920748, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9965198040008545, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8736441135406494, + "num_tokens": 841398568.0, + "step": 22050 + }, + { + "epoch": 2.8051138531993383, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9630985260009766, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8635077476501465, + "num_tokens": 841435722.0, + "step": 22051 + }, + { + "epoch": 2.8052410634779292, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9560737609863281, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8615867495536804, + "num_tokens": 841473635.0, + "step": 22052 + }, + { + "epoch": 2.8053682737565193, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9120466709136963, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8678197264671326, + "num_tokens": 841512383.0, + "step": 22053 + }, + { + "epoch": 2.8054954840351103, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9559123516082764, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8597589731216431, + "num_tokens": 841543174.0, + "step": 22054 + }, + { + "epoch": 2.8056226943137004, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.807985544204712, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8693819046020508, + "num_tokens": 841582185.0, + "step": 22055 + }, + { + "epoch": 2.805749904592291, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.902217984199524, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8596727848052979, + "num_tokens": 841623179.0, + "step": 22056 + }, + { + "epoch": 2.8058771148708814, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.956330418586731, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.873591423034668, + "num_tokens": 841661062.0, + "step": 22057 + }, + { + "epoch": 2.806004325149472, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8546128273010254, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8629242777824402, + "num_tokens": 841704259.0, + "step": 22058 + }, + { + "epoch": 2.8061315354280625, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.788277506828308, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8633261919021606, + "num_tokens": 841748396.0, + "step": 22059 + }, + { + "epoch": 2.806258745706653, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7635082006454468, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8844077587127686, + "num_tokens": 841787809.0, + "step": 22060 + }, + { + "epoch": 2.8063859559852435, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.98943293094635, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8654798865318298, + "num_tokens": 841824046.0, + "step": 22061 + }, + { + "epoch": 2.806513166263834, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.828656554222107, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8751571178436279, + "num_tokens": 841856949.0, + "step": 22062 + }, + { + "epoch": 2.8066403765424246, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9721781015396118, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.851308286190033, + "num_tokens": 841898644.0, + "step": 22063 + }, + { + "epoch": 2.806767586821015, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.163067579269409, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8693350553512573, + "num_tokens": 841934795.0, + "step": 22064 + }, + { + "epoch": 2.8068947970996057, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9689892530441284, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8497180938720703, + "num_tokens": 841976878.0, + "step": 22065 + }, + { + "epoch": 2.807022007378196, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0716614723205566, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8655434846878052, + "num_tokens": 842010998.0, + "step": 22066 + }, + { + "epoch": 2.8071492176567867, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.727881669998169, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8913425207138062, + "num_tokens": 842051410.0, + "step": 22067 + }, + { + "epoch": 2.8072764279353772, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8935303688049316, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8680181503295898, + "num_tokens": 842087954.0, + "step": 22068 + }, + { + "epoch": 2.8074036382139678, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8477643728256226, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8613022565841675, + "num_tokens": 842128326.0, + "step": 22069 + }, + { + "epoch": 2.8075308484925583, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0168938636779785, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8445789813995361, + "num_tokens": 842171143.0, + "step": 22070 + }, + { + "epoch": 2.807658058771149, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7790182828903198, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8519309759140015, + "num_tokens": 842212167.0, + "step": 22071 + }, + { + "epoch": 2.8077852690497394, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7405784130096436, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8579937219619751, + "num_tokens": 842252576.0, + "step": 22072 + }, + { + "epoch": 2.80791247932833, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.911515474319458, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8621215224266052, + "num_tokens": 842287812.0, + "step": 22073 + }, + { + "epoch": 2.80803968960692, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.891303300857544, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8618236780166626, + "num_tokens": 842328289.0, + "step": 22074 + }, + { + "epoch": 2.808166899885511, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.062354564666748, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8681132793426514, + "num_tokens": 842359182.0, + "step": 22075 + }, + { + "epoch": 2.808294110164101, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8699740171432495, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8575366735458374, + "num_tokens": 842399820.0, + "step": 22076 + }, + { + "epoch": 2.808421320442692, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0043511390686035, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8633063435554504, + "num_tokens": 842437852.0, + "step": 22077 + }, + { + "epoch": 2.808548530721282, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9189307689666748, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8515455722808838, + "num_tokens": 842476729.0, + "step": 22078 + }, + { + "epoch": 2.808675740999873, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.851172924041748, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8523973226547241, + "num_tokens": 842514590.0, + "step": 22079 + }, + { + "epoch": 2.808802951278463, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8409639596939087, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8641101121902466, + "num_tokens": 842556502.0, + "step": 22080 + }, + { + "epoch": 2.8089301615570537, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.915787696838379, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8533651828765869, + "num_tokens": 842594299.0, + "step": 22081 + }, + { + "epoch": 2.809057371835644, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8094650506973267, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8624125123023987, + "num_tokens": 842633189.0, + "step": 22082 + }, + { + "epoch": 2.8091845821142347, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8878675699234009, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8686362504959106, + "num_tokens": 842673626.0, + "step": 22083 + }, + { + "epoch": 2.8093117923928252, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.941465973854065, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8591751456260681, + "num_tokens": 842712574.0, + "step": 22084 + }, + { + "epoch": 2.8094390026714158, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9753036499023438, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8631293177604675, + "num_tokens": 842749534.0, + "step": 22085 + }, + { + "epoch": 2.8095662129500063, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0647976398468018, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.855073094367981, + "num_tokens": 842784989.0, + "step": 22086 + }, + { + "epoch": 2.809693423228597, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8203564882278442, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8660481572151184, + "num_tokens": 842823858.0, + "step": 22087 + }, + { + "epoch": 2.8098206335071874, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7990307807922363, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8648642897605896, + "num_tokens": 842858849.0, + "step": 22088 + }, + { + "epoch": 2.809947843785778, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8004357814788818, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8855113983154297, + "num_tokens": 842895809.0, + "step": 22089 + }, + { + "epoch": 2.8100750540643684, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8571985960006714, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8928512334823608, + "num_tokens": 842933650.0, + "step": 22090 + }, + { + "epoch": 2.810202264342959, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.994827389717102, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8698951601982117, + "num_tokens": 842967200.0, + "step": 22091 + }, + { + "epoch": 2.8103294746215495, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9279347658157349, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8550310134887695, + "num_tokens": 843002498.0, + "step": 22092 + }, + { + "epoch": 2.81045668490014, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8443281650543213, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8707271814346313, + "num_tokens": 843042627.0, + "step": 22093 + }, + { + "epoch": 2.8105838951787305, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2113234996795654, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8613863587379456, + "num_tokens": 843076862.0, + "step": 22094 + }, + { + "epoch": 2.810711105457321, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0384836196899414, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.864834725856781, + "num_tokens": 843115000.0, + "step": 22095 + }, + { + "epoch": 2.8108383157359116, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1207053661346436, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8718944787979126, + "num_tokens": 843152408.0, + "step": 22096 + }, + { + "epoch": 2.810965526014502, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.946628212928772, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8665912747383118, + "num_tokens": 843189269.0, + "step": 22097 + }, + { + "epoch": 2.8110927362930926, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.928395390510559, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8595722317695618, + "num_tokens": 843227148.0, + "step": 22098 + }, + { + "epoch": 2.8112199465716827, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1341392993927, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8547583222389221, + "num_tokens": 843259736.0, + "step": 22099 + }, + { + "epoch": 2.8113471568502737, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.385876417160034, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8613237142562866, + "num_tokens": 843302598.0, + "step": 22100 + }, + { + "epoch": 2.811474367128864, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9106436967849731, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8813310861587524, + "num_tokens": 843337615.0, + "step": 22101 + }, + { + "epoch": 2.8116015774074548, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8309494256973267, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8812294602394104, + "num_tokens": 843378845.0, + "step": 22102 + }, + { + "epoch": 2.811728787686045, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9405077695846558, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8548242449760437, + "num_tokens": 843414128.0, + "step": 22103 + }, + { + "epoch": 2.811855997964636, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8728879690170288, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8826098442077637, + "num_tokens": 843454255.0, + "step": 22104 + }, + { + "epoch": 2.811983208243226, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8920291662216187, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8656384944915771, + "num_tokens": 843498332.0, + "step": 22105 + }, + { + "epoch": 2.8121104185218164, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8280091285705566, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8559643030166626, + "num_tokens": 843539196.0, + "step": 22106 + }, + { + "epoch": 2.812237628800407, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.959295392036438, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8700835704803467, + "num_tokens": 843573767.0, + "step": 22107 + }, + { + "epoch": 2.8123648390789975, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9625816345214844, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8531845808029175, + "num_tokens": 843611156.0, + "step": 22108 + }, + { + "epoch": 2.812492049357588, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.084909439086914, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8708365559577942, + "num_tokens": 843644026.0, + "step": 22109 + }, + { + "epoch": 2.8126192596361785, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2963249683380127, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8693015575408936, + "num_tokens": 843683477.0, + "step": 22110 + }, + { + "epoch": 2.812746469914769, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9953533411026, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8514753580093384, + "num_tokens": 843722815.0, + "step": 22111 + }, + { + "epoch": 2.8128736801933596, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.944138765335083, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8675012588500977, + "num_tokens": 843763843.0, + "step": 22112 + }, + { + "epoch": 2.81300089047195, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8082808256149292, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8580948114395142, + "num_tokens": 843805327.0, + "step": 22113 + }, + { + "epoch": 2.8131281007505406, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0964269638061523, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8664872646331787, + "num_tokens": 843850132.0, + "step": 22114 + }, + { + "epoch": 2.813255311029131, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9538085460662842, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8664582967758179, + "num_tokens": 843887425.0, + "step": 22115 + }, + { + "epoch": 2.8133825213077217, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9647547006607056, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8736007809638977, + "num_tokens": 843920728.0, + "step": 22116 + }, + { + "epoch": 2.8135097315863122, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0781049728393555, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8626633286476135, + "num_tokens": 843959733.0, + "step": 22117 + }, + { + "epoch": 2.8136369418649028, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.898319125175476, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8667213916778564, + "num_tokens": 844005616.0, + "step": 22118 + }, + { + "epoch": 2.8137641521434933, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1684274673461914, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8800885677337646, + "num_tokens": 844034216.0, + "step": 22119 + }, + { + "epoch": 2.813891362422084, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.83088219165802, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8621442317962646, + "num_tokens": 844071173.0, + "step": 22120 + }, + { + "epoch": 2.8140185727006743, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.712483286857605, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8625350594520569, + "num_tokens": 844113665.0, + "step": 22121 + }, + { + "epoch": 2.814145782979265, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8837860822677612, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8650251626968384, + "num_tokens": 844154063.0, + "step": 22122 + }, + { + "epoch": 2.8142729932578554, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.028452157974243, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8854548931121826, + "num_tokens": 844185634.0, + "step": 22123 + }, + { + "epoch": 2.8144002035364455, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9097837209701538, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8551955223083496, + "num_tokens": 844221755.0, + "step": 22124 + }, + { + "epoch": 2.8145274138150365, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0121665000915527, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8702167272567749, + "num_tokens": 844256230.0, + "step": 22125 + }, + { + "epoch": 2.8146546240936265, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.020493745803833, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8582831025123596, + "num_tokens": 844292823.0, + "step": 22126 + }, + { + "epoch": 2.8147818343722175, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0517163276672363, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8430962562561035, + "num_tokens": 844331843.0, + "step": 22127 + }, + { + "epoch": 2.8149090446508076, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.6846792697906494, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8755532503128052, + "num_tokens": 844375823.0, + "step": 22128 + }, + { + "epoch": 2.815036254929398, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.904895305633545, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8662427067756653, + "num_tokens": 844415187.0, + "step": 22129 + }, + { + "epoch": 2.8151634652079887, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9275486469268799, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8693740367889404, + "num_tokens": 844454986.0, + "step": 22130 + }, + { + "epoch": 2.815290675486579, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9845695495605469, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8720548152923584, + "num_tokens": 844495514.0, + "step": 22131 + }, + { + "epoch": 2.8154178857651697, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.028048276901245, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.864139199256897, + "num_tokens": 844532406.0, + "step": 22132 + }, + { + "epoch": 2.8155450960437602, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9318002462387085, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8650786876678467, + "num_tokens": 844569561.0, + "step": 22133 + }, + { + "epoch": 2.8156723063223508, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8594741821289062, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8704066872596741, + "num_tokens": 844613504.0, + "step": 22134 + }, + { + "epoch": 2.8157995166009413, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0582947731018066, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8637430667877197, + "num_tokens": 844649255.0, + "step": 22135 + }, + { + "epoch": 2.815926726879532, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9483006000518799, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8670774102210999, + "num_tokens": 844687202.0, + "step": 22136 + }, + { + "epoch": 2.8160539371581224, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.063934803009033, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8669720888137817, + "num_tokens": 844723096.0, + "step": 22137 + }, + { + "epoch": 2.816181147436713, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9775665998458862, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8660599589347839, + "num_tokens": 844759754.0, + "step": 22138 + }, + { + "epoch": 2.8163083577153034, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9462400674819946, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8687366843223572, + "num_tokens": 844798372.0, + "step": 22139 + }, + { + "epoch": 2.816435567993894, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8925201892852783, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8598637580871582, + "num_tokens": 844832738.0, + "step": 22140 + }, + { + "epoch": 2.8165627782724845, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9207683801651, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.86688232421875, + "num_tokens": 844871684.0, + "step": 22141 + }, + { + "epoch": 2.816689988551075, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.012723207473755, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.85878986120224, + "num_tokens": 844912140.0, + "step": 22142 + }, + { + "epoch": 2.8168171988296655, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9535279273986816, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8537983894348145, + "num_tokens": 844948256.0, + "step": 22143 + }, + { + "epoch": 2.816944409108256, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.788851022720337, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.881175696849823, + "num_tokens": 844983821.0, + "step": 22144 + }, + { + "epoch": 2.8170716193868466, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.857843041419983, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8684505224227905, + "num_tokens": 845020622.0, + "step": 22145 + }, + { + "epoch": 2.817198829665437, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9361960887908936, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8566119074821472, + "num_tokens": 845056748.0, + "step": 22146 + }, + { + "epoch": 2.8173260399440276, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8938970565795898, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.846213698387146, + "num_tokens": 845096294.0, + "step": 22147 + }, + { + "epoch": 2.817453250222618, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.099576473236084, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8806360960006714, + "num_tokens": 845135132.0, + "step": 22148 + }, + { + "epoch": 2.8175804605012083, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9554035663604736, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8746512532234192, + "num_tokens": 845172164.0, + "step": 22149 + }, + { + "epoch": 2.8177076707797992, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9343020915985107, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8661127090454102, + "num_tokens": 845210270.0, + "step": 22150 + }, + { + "epoch": 2.8178348810583893, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8292884826660156, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8690880537033081, + "num_tokens": 845251863.0, + "step": 22151 + }, + { + "epoch": 2.8179620913369803, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9930936098098755, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8699513673782349, + "num_tokens": 845290262.0, + "step": 22152 + }, + { + "epoch": 2.8180893016155704, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9374196529388428, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8736013770103455, + "num_tokens": 845326894.0, + "step": 22153 + }, + { + "epoch": 2.818216511894161, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8813307285308838, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8748060464859009, + "num_tokens": 845367545.0, + "step": 22154 + }, + { + "epoch": 2.8183437221727514, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0324432849884033, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8510655164718628, + "num_tokens": 845400920.0, + "step": 22155 + }, + { + "epoch": 2.818470932451342, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0087435245513916, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8621269464492798, + "num_tokens": 845436279.0, + "step": 22156 + }, + { + "epoch": 2.8185981427299325, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.903800129890442, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.869941234588623, + "num_tokens": 845476308.0, + "step": 22157 + }, + { + "epoch": 2.818725353008523, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8617380857467651, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.862587571144104, + "num_tokens": 845516624.0, + "step": 22158 + }, + { + "epoch": 2.8188525632871135, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8486472368240356, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8833447694778442, + "num_tokens": 845552385.0, + "step": 22159 + }, + { + "epoch": 2.818979773565704, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.732710599899292, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8741185069084167, + "num_tokens": 845592600.0, + "step": 22160 + }, + { + "epoch": 2.8191069838442946, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.84348726272583, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8742577433586121, + "num_tokens": 845630028.0, + "step": 22161 + }, + { + "epoch": 2.819234194122885, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8729947805404663, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8612737655639648, + "num_tokens": 845665874.0, + "step": 22162 + }, + { + "epoch": 2.8193614044014756, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0057308673858643, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8538890480995178, + "num_tokens": 845702738.0, + "step": 22163 + }, + { + "epoch": 2.819488614680066, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7676351070404053, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8703877329826355, + "num_tokens": 845744785.0, + "step": 22164 + }, + { + "epoch": 2.8196158249586567, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9777604341506958, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8663860559463501, + "num_tokens": 845779196.0, + "step": 22165 + }, + { + "epoch": 2.8197430352372472, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9982551336288452, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8540250062942505, + "num_tokens": 845819016.0, + "step": 22166 + }, + { + "epoch": 2.8198702455158378, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.057314157485962, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8687942028045654, + "num_tokens": 845855014.0, + "step": 22167 + }, + { + "epoch": 2.8199974557944283, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8556658029556274, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8745179176330566, + "num_tokens": 845895491.0, + "step": 22168 + }, + { + "epoch": 2.820124666073019, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9538426399230957, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8567137122154236, + "num_tokens": 845931560.0, + "step": 22169 + }, + { + "epoch": 2.8202518763516093, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.4991116523742676, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8662549257278442, + "num_tokens": 845966701.0, + "step": 22170 + }, + { + "epoch": 2.8203790866302, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8788522481918335, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8840669989585876, + "num_tokens": 846002327.0, + "step": 22171 + }, + { + "epoch": 2.82050629690879, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8033939599990845, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8669499754905701, + "num_tokens": 846042157.0, + "step": 22172 + }, + { + "epoch": 2.820633507187381, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.87999427318573, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8520674705505371, + "num_tokens": 846082851.0, + "step": 22173 + }, + { + "epoch": 2.820760717465971, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9834424257278442, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8641948699951172, + "num_tokens": 846116846.0, + "step": 22174 + }, + { + "epoch": 2.820887927744562, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1922121047973633, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8685869574546814, + "num_tokens": 846154373.0, + "step": 22175 + }, + { + "epoch": 2.821015138023152, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6853611469268799, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8842310905456543, + "num_tokens": 846195329.0, + "step": 22176 + }, + { + "epoch": 2.821142348301743, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9331326484680176, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8759404420852661, + "num_tokens": 846233078.0, + "step": 22177 + }, + { + "epoch": 2.821269558580333, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8860201835632324, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8655219078063965, + "num_tokens": 846269030.0, + "step": 22178 + }, + { + "epoch": 2.8213967688589237, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8411741256713867, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8612393140792847, + "num_tokens": 846307269.0, + "step": 22179 + }, + { + "epoch": 2.821523979137514, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0783700942993164, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8567941188812256, + "num_tokens": 846339116.0, + "step": 22180 + }, + { + "epoch": 2.8216511894161047, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9401109218597412, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8598254323005676, + "num_tokens": 846384968.0, + "step": 22181 + }, + { + "epoch": 2.8217783996946952, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8470231294631958, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8502053022384644, + "num_tokens": 846426235.0, + "step": 22182 + }, + { + "epoch": 2.8219056099732858, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9008290767669678, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8533985018730164, + "num_tokens": 846465715.0, + "step": 22183 + }, + { + "epoch": 2.8220328202518763, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9995864629745483, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8524872064590454, + "num_tokens": 846499972.0, + "step": 22184 + }, + { + "epoch": 2.822160030530467, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9915233850479126, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8647255897521973, + "num_tokens": 846535906.0, + "step": 22185 + }, + { + "epoch": 2.8222872408090574, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0311789512634277, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8593829870223999, + "num_tokens": 846572723.0, + "step": 22186 + }, + { + "epoch": 2.822414451087648, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9387309551239014, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.869261622428894, + "num_tokens": 846613501.0, + "step": 22187 + }, + { + "epoch": 2.8225416613662384, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 80.52090454101562, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8866548538208008, + "num_tokens": 846650001.0, + "step": 22188 + }, + { + "epoch": 2.822668871644829, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9924283027648926, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8529874086380005, + "num_tokens": 846690636.0, + "step": 22189 + }, + { + "epoch": 2.8227960819234195, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.4615278244018555, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.844663679599762, + "num_tokens": 846728513.0, + "step": 22190 + }, + { + "epoch": 2.82292329220201, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.008838415145874, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8510776162147522, + "num_tokens": 846771989.0, + "step": 22191 + }, + { + "epoch": 2.8230505024806005, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8079298734664917, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8827834129333496, + "num_tokens": 846808393.0, + "step": 22192 + }, + { + "epoch": 2.823177712759191, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7586915493011475, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8570941090583801, + "num_tokens": 846846928.0, + "step": 22193 + }, + { + "epoch": 2.8233049230377816, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0125157833099365, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8551737070083618, + "num_tokens": 846884637.0, + "step": 22194 + }, + { + "epoch": 2.823432133316372, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9933589696884155, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8699564933776855, + "num_tokens": 846925749.0, + "step": 22195 + }, + { + "epoch": 2.8235593435949626, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9920103549957275, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8498196601867676, + "num_tokens": 846967191.0, + "step": 22196 + }, + { + "epoch": 2.8236865538735527, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.944467306137085, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8710829615592957, + "num_tokens": 847003280.0, + "step": 22197 + }, + { + "epoch": 2.8238137641521437, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.004568099975586, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8531156778335571, + "num_tokens": 847042179.0, + "step": 22198 + }, + { + "epoch": 2.8239409744307338, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8975629806518555, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8620679974555969, + "num_tokens": 847083008.0, + "step": 22199 + }, + { + "epoch": 2.8240681847093247, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9871013164520264, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8440381288528442, + "num_tokens": 847124009.0, + "step": 22200 + }, + { + "epoch": 2.824195394987915, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9180364608764648, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8594636917114258, + "num_tokens": 847163526.0, + "step": 22201 + }, + { + "epoch": 2.824322605266506, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.084059715270996, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8645113706588745, + "num_tokens": 847201374.0, + "step": 22202 + }, + { + "epoch": 2.824449815545096, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.010514974594116, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.849750280380249, + "num_tokens": 847239469.0, + "step": 22203 + }, + { + "epoch": 2.8245770258236864, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.934930443763733, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8632134199142456, + "num_tokens": 847275964.0, + "step": 22204 + }, + { + "epoch": 2.824704236102277, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8199907541275024, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8664546012878418, + "num_tokens": 847316815.0, + "step": 22205 + }, + { + "epoch": 2.8248314463808675, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0333058834075928, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8691407442092896, + "num_tokens": 847350949.0, + "step": 22206 + }, + { + "epoch": 2.824958656659458, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8235044479370117, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8695648908615112, + "num_tokens": 847388945.0, + "step": 22207 + }, + { + "epoch": 2.8250858669380485, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.966849684715271, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8776126503944397, + "num_tokens": 847435114.0, + "step": 22208 + }, + { + "epoch": 2.825213077216639, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.023231267929077, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8568360805511475, + "num_tokens": 847470815.0, + "step": 22209 + }, + { + "epoch": 2.8253402874952296, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.954007625579834, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8625866174697876, + "num_tokens": 847509379.0, + "step": 22210 + }, + { + "epoch": 2.82546749777382, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8208929300308228, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.868476390838623, + "num_tokens": 847547427.0, + "step": 22211 + }, + { + "epoch": 2.8255947080524106, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9533250331878662, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8479750752449036, + "num_tokens": 847584740.0, + "step": 22212 + }, + { + "epoch": 2.825721918331001, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8992079496383667, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8549100160598755, + "num_tokens": 847626201.0, + "step": 22213 + }, + { + "epoch": 2.8258491286095917, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8452290296554565, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8796444535255432, + "num_tokens": 847661311.0, + "step": 22214 + }, + { + "epoch": 2.8259763388881822, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8597768545150757, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8642714619636536, + "num_tokens": 847695234.0, + "step": 22215 + }, + { + "epoch": 2.8261035491667728, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7731258869171143, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8828701972961426, + "num_tokens": 847738848.0, + "step": 22216 + }, + { + "epoch": 2.8262307594453633, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9710216522216797, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8682866096496582, + "num_tokens": 847775952.0, + "step": 22217 + }, + { + "epoch": 2.826357969723954, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.995975375175476, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8809642791748047, + "num_tokens": 847809381.0, + "step": 22218 + }, + { + "epoch": 2.8264851800025443, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9213837385177612, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8629076480865479, + "num_tokens": 847849835.0, + "step": 22219 + }, + { + "epoch": 2.826612390281135, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9834060668945312, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8630737066268921, + "num_tokens": 847882726.0, + "step": 22220 + }, + { + "epoch": 2.8267396005597254, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8840856552124023, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8661866188049316, + "num_tokens": 847920663.0, + "step": 22221 + }, + { + "epoch": 2.8268668108383155, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9480973482131958, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8708423972129822, + "num_tokens": 847962204.0, + "step": 22222 + }, + { + "epoch": 2.8269940211169065, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9227142333984375, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8566097021102905, + "num_tokens": 848003722.0, + "step": 22223 + }, + { + "epoch": 2.8271212313954965, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.897286057472229, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8743191957473755, + "num_tokens": 848045335.0, + "step": 22224 + }, + { + "epoch": 2.8272484416740875, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8140877485275269, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8572028875350952, + "num_tokens": 848088817.0, + "step": 22225 + }, + { + "epoch": 2.8273756519526776, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9048734903335571, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8565906286239624, + "num_tokens": 848120993.0, + "step": 22226 + }, + { + "epoch": 2.827502862231268, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 16.613231658935547, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8554084300994873, + "num_tokens": 848162542.0, + "step": 22227 + }, + { + "epoch": 2.8276300725098586, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2010021209716797, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8775894641876221, + "num_tokens": 848198194.0, + "step": 22228 + }, + { + "epoch": 2.827757282788449, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.993796467781067, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8600629568099976, + "num_tokens": 848240575.0, + "step": 22229 + }, + { + "epoch": 2.8278844930670397, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8706706762313843, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8651952743530273, + "num_tokens": 848289026.0, + "step": 22230 + }, + { + "epoch": 2.8280117033456302, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9391213655471802, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8668791055679321, + "num_tokens": 848326438.0, + "step": 22231 + }, + { + "epoch": 2.8281389136242208, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8309056758880615, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8718100786209106, + "num_tokens": 848368106.0, + "step": 22232 + }, + { + "epoch": 2.8282661239028113, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.903516411781311, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.862876296043396, + "num_tokens": 848409775.0, + "step": 22233 + }, + { + "epoch": 2.828393334181402, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.5917909145355225, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8659584522247314, + "num_tokens": 848442149.0, + "step": 22234 + }, + { + "epoch": 2.8285205444599923, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9957975149154663, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8598631620407104, + "num_tokens": 848475468.0, + "step": 22235 + }, + { + "epoch": 2.828647754738583, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9711638689041138, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8659391403198242, + "num_tokens": 848509565.0, + "step": 22236 + }, + { + "epoch": 2.8287749650171734, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.847391128540039, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8670177459716797, + "num_tokens": 848546357.0, + "step": 22237 + }, + { + "epoch": 2.828902175295764, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8131046295166016, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.853662371635437, + "num_tokens": 848587661.0, + "step": 22238 + }, + { + "epoch": 2.8290293855743545, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8646785020828247, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8686153888702393, + "num_tokens": 848628250.0, + "step": 22239 + }, + { + "epoch": 2.829156595852945, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.805397868156433, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8652213215827942, + "num_tokens": 848670452.0, + "step": 22240 + }, + { + "epoch": 2.8292838061315355, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9697233438491821, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8667791485786438, + "num_tokens": 848703592.0, + "step": 22241 + }, + { + "epoch": 2.829411016410126, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9072794914245605, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.849204421043396, + "num_tokens": 848739291.0, + "step": 22242 + }, + { + "epoch": 2.8295382266887166, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.106055736541748, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8582603931427002, + "num_tokens": 848776039.0, + "step": 22243 + }, + { + "epoch": 2.829665436967307, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9982677698135376, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8685714602470398, + "num_tokens": 848814799.0, + "step": 22244 + }, + { + "epoch": 2.8297926472458976, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8525211811065674, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.86667799949646, + "num_tokens": 848860123.0, + "step": 22245 + }, + { + "epoch": 2.829919857524488, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7250831127166748, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8816689848899841, + "num_tokens": 848900447.0, + "step": 22246 + }, + { + "epoch": 2.8300470678030782, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8393263816833496, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8587170839309692, + "num_tokens": 848945385.0, + "step": 22247 + }, + { + "epoch": 2.830174278081669, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 16.631635665893555, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8547191619873047, + "num_tokens": 848978783.0, + "step": 22248 + }, + { + "epoch": 2.8303014883602593, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1718051433563232, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8528745174407959, + "num_tokens": 849017324.0, + "step": 22249 + }, + { + "epoch": 2.8304286986388503, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2226693630218506, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.886646032333374, + "num_tokens": 849045928.0, + "step": 22250 + }, + { + "epoch": 2.8305559089174404, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.884168267250061, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8648507595062256, + "num_tokens": 849080855.0, + "step": 22251 + }, + { + "epoch": 2.830683119196031, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0299155712127686, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8688237071037292, + "num_tokens": 849114835.0, + "step": 22252 + }, + { + "epoch": 2.8308103294746214, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0268404483795166, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8542705774307251, + "num_tokens": 849152632.0, + "step": 22253 + }, + { + "epoch": 2.830937539753212, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.254870653152466, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8724672794342041, + "num_tokens": 849189911.0, + "step": 22254 + }, + { + "epoch": 2.8310647500318025, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.960320234298706, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8623201847076416, + "num_tokens": 849227847.0, + "step": 22255 + }, + { + "epoch": 2.831191960310393, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9604600667953491, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8707191944122314, + "num_tokens": 849265778.0, + "step": 22256 + }, + { + "epoch": 2.8313191705889835, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9369442462921143, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8640192747116089, + "num_tokens": 849302488.0, + "step": 22257 + }, + { + "epoch": 2.831446380867574, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.216313362121582, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8809330463409424, + "num_tokens": 849337483.0, + "step": 22258 + }, + { + "epoch": 2.8315735911461646, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7919291257858276, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8711909651756287, + "num_tokens": 849378491.0, + "step": 22259 + }, + { + "epoch": 2.831700801424755, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9058289527893066, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8719199895858765, + "num_tokens": 849418897.0, + "step": 22260 + }, + { + "epoch": 2.8318280117033456, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0331926345825195, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8685580492019653, + "num_tokens": 849457057.0, + "step": 22261 + }, + { + "epoch": 2.831955221981936, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9796501398086548, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.868494987487793, + "num_tokens": 849488825.0, + "step": 22262 + }, + { + "epoch": 2.8320824322605267, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9800053834915161, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8733810186386108, + "num_tokens": 849528336.0, + "step": 22263 + }, + { + "epoch": 2.8322096425391172, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.945818305015564, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8797738552093506, + "num_tokens": 849565373.0, + "step": 22264 + }, + { + "epoch": 2.8323368528177078, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9272000789642334, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8769775629043579, + "num_tokens": 849600909.0, + "step": 22265 + }, + { + "epoch": 2.8324640630962983, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8909692764282227, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8570078611373901, + "num_tokens": 849639499.0, + "step": 22266 + }, + { + "epoch": 2.832591273374889, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8409382104873657, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8760992288589478, + "num_tokens": 849677232.0, + "step": 22267 + }, + { + "epoch": 2.8327184836534793, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8571845293045044, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8719160556793213, + "num_tokens": 849717187.0, + "step": 22268 + }, + { + "epoch": 2.83284569393207, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7029874324798584, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8747861981391907, + "num_tokens": 849756545.0, + "step": 22269 + }, + { + "epoch": 2.83297290421066, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.336531400680542, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8681947588920593, + "num_tokens": 849788670.0, + "step": 22270 + }, + { + "epoch": 2.833100114489251, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.011916399002075, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8658657073974609, + "num_tokens": 849825715.0, + "step": 22271 + }, + { + "epoch": 2.833227324767841, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2320337295532227, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8721833229064941, + "num_tokens": 849856435.0, + "step": 22272 + }, + { + "epoch": 2.833354535046432, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8151601552963257, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8808907270431519, + "num_tokens": 849895634.0, + "step": 22273 + }, + { + "epoch": 2.833481745325022, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8769652843475342, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8447482585906982, + "num_tokens": 849938646.0, + "step": 22274 + }, + { + "epoch": 2.833608955603613, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8553390502929688, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8634530901908875, + "num_tokens": 849978642.0, + "step": 22275 + }, + { + "epoch": 2.833736165882203, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.91028892993927, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8645012974739075, + "num_tokens": 850017820.0, + "step": 22276 + }, + { + "epoch": 2.8338633761607936, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.031237840652466, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8480817675590515, + "num_tokens": 850056619.0, + "step": 22277 + }, + { + "epoch": 2.833990586439384, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9000223875045776, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8626724481582642, + "num_tokens": 850092456.0, + "step": 22278 + }, + { + "epoch": 2.8341177967179747, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9026411771774292, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8624864220619202, + "num_tokens": 850131598.0, + "step": 22279 + }, + { + "epoch": 2.8342450069965652, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8596560955047607, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8573517203330994, + "num_tokens": 850170884.0, + "step": 22280 + }, + { + "epoch": 2.8343722172751558, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9788480997085571, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8622821569442749, + "num_tokens": 850206989.0, + "step": 22281 + }, + { + "epoch": 2.8344994275537463, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8049131631851196, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8624972105026245, + "num_tokens": 850248602.0, + "step": 22282 + }, + { + "epoch": 2.834626637832337, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8894538879394531, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8561151027679443, + "num_tokens": 850289790.0, + "step": 22283 + }, + { + "epoch": 2.8347538481109273, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8493812084197998, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8730341196060181, + "num_tokens": 850326688.0, + "step": 22284 + }, + { + "epoch": 2.834881058389518, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7679643630981445, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8773563504219055, + "num_tokens": 850366460.0, + "step": 22285 + }, + { + "epoch": 2.8350082686681084, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8968994617462158, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8721621036529541, + "num_tokens": 850400046.0, + "step": 22286 + }, + { + "epoch": 2.835135478946699, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.01955509185791, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8560500144958496, + "num_tokens": 850434374.0, + "step": 22287 + }, + { + "epoch": 2.8352626892252895, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.145310878753662, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8676437139511108, + "num_tokens": 850469056.0, + "step": 22288 + }, + { + "epoch": 2.83538989950388, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9907487630844116, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.863737940788269, + "num_tokens": 850505805.0, + "step": 22289 + }, + { + "epoch": 2.8355171097824705, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.856671929359436, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8538991808891296, + "num_tokens": 850550056.0, + "step": 22290 + }, + { + "epoch": 2.835644320061061, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9263325929641724, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8736843466758728, + "num_tokens": 850585075.0, + "step": 22291 + }, + { + "epoch": 2.8357715303396516, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7440619468688965, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8759281635284424, + "num_tokens": 850625758.0, + "step": 22292 + }, + { + "epoch": 2.835898740618242, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8199095726013184, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8792836666107178, + "num_tokens": 850664608.0, + "step": 22293 + }, + { + "epoch": 2.8360259508968326, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8276983499526978, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8743412494659424, + "num_tokens": 850705897.0, + "step": 22294 + }, + { + "epoch": 2.8361531611754227, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.840284824371338, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.859705924987793, + "num_tokens": 850746195.0, + "step": 22295 + }, + { + "epoch": 2.8362803714540137, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8396434783935547, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8727903366088867, + "num_tokens": 850785183.0, + "step": 22296 + }, + { + "epoch": 2.8364075817326038, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9747332334518433, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8717827200889587, + "num_tokens": 850819558.0, + "step": 22297 + }, + { + "epoch": 2.8365347920111947, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9564093351364136, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8558886647224426, + "num_tokens": 850857385.0, + "step": 22298 + }, + { + "epoch": 2.836662002289785, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9171180725097656, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8609538078308105, + "num_tokens": 850895297.0, + "step": 22299 + }, + { + "epoch": 2.836789212568376, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.050945281982422, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8655404448509216, + "num_tokens": 850930982.0, + "step": 22300 + }, + { + "epoch": 2.836916422846966, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9441417455673218, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8619592189788818, + "num_tokens": 850969032.0, + "step": 22301 + }, + { + "epoch": 2.8370436331255564, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9229718446731567, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8691321611404419, + "num_tokens": 851008325.0, + "step": 22302 + }, + { + "epoch": 2.837170843404147, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8847190141677856, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8684864044189453, + "num_tokens": 851044289.0, + "step": 22303 + }, + { + "epoch": 2.8372980536827375, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.859257698059082, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8713986277580261, + "num_tokens": 851079887.0, + "step": 22304 + }, + { + "epoch": 2.837425263961328, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9548544883728027, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8688508868217468, + "num_tokens": 851124535.0, + "step": 22305 + }, + { + "epoch": 2.8375524742399185, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0178768634796143, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8760665655136108, + "num_tokens": 851162982.0, + "step": 22306 + }, + { + "epoch": 2.837679684518509, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.780693531036377, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8737574815750122, + "num_tokens": 851201864.0, + "step": 22307 + }, + { + "epoch": 2.8378068947970996, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8154704570770264, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8691047430038452, + "num_tokens": 851241659.0, + "step": 22308 + }, + { + "epoch": 2.83793410507569, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.418240547180176, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8567166328430176, + "num_tokens": 851278861.0, + "step": 22309 + }, + { + "epoch": 2.8380613153542806, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9657747745513916, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8670353889465332, + "num_tokens": 851314442.0, + "step": 22310 + }, + { + "epoch": 2.838188525632871, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8026254177093506, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8609201908111572, + "num_tokens": 851356322.0, + "step": 22311 + }, + { + "epoch": 2.8383157359114617, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7744899988174438, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8545531034469604, + "num_tokens": 851395374.0, + "step": 22312 + }, + { + "epoch": 2.838442946190052, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.959960699081421, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8649811148643494, + "num_tokens": 851430957.0, + "step": 22313 + }, + { + "epoch": 2.8385701564686427, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3390650749206543, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8616065979003906, + "num_tokens": 851474357.0, + "step": 22314 + }, + { + "epoch": 2.8386973667472333, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9552052021026611, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8790281414985657, + "num_tokens": 851510346.0, + "step": 22315 + }, + { + "epoch": 2.838824577025824, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8840419054031372, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8687100410461426, + "num_tokens": 851546798.0, + "step": 22316 + }, + { + "epoch": 2.8389517873044143, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2662546634674072, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8691815733909607, + "num_tokens": 851587819.0, + "step": 22317 + }, + { + "epoch": 2.839078997583005, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.73996901512146, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.871421754360199, + "num_tokens": 851628764.0, + "step": 22318 + }, + { + "epoch": 2.8392062078615954, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7648487091064453, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8529535531997681, + "num_tokens": 851672206.0, + "step": 22319 + }, + { + "epoch": 2.8393334181401855, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.786628246307373, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8599989414215088, + "num_tokens": 851709969.0, + "step": 22320 + }, + { + "epoch": 2.8394606284187764, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9348759651184082, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8584299683570862, + "num_tokens": 851749015.0, + "step": 22321 + }, + { + "epoch": 2.8395878386973665, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9636496305465698, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8706978559494019, + "num_tokens": 851787210.0, + "step": 22322 + }, + { + "epoch": 2.8397150489759575, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9810916185379028, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8596178889274597, + "num_tokens": 851824677.0, + "step": 22323 + }, + { + "epoch": 2.8398422592545476, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9090970754623413, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8721492290496826, + "num_tokens": 851863614.0, + "step": 22324 + }, + { + "epoch": 2.839969469533138, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8589305877685547, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.86079341173172, + "num_tokens": 851908153.0, + "step": 22325 + }, + { + "epoch": 2.8400966798117286, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8705830574035645, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8624842762947083, + "num_tokens": 851946473.0, + "step": 22326 + }, + { + "epoch": 2.840223890090319, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7750818729400635, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8557716608047485, + "num_tokens": 851991600.0, + "step": 22327 + }, + { + "epoch": 2.8403511003689097, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8818515539169312, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8672809600830078, + "num_tokens": 852029468.0, + "step": 22328 + }, + { + "epoch": 2.8404783106475002, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.07851243019104, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8713446855545044, + "num_tokens": 852061400.0, + "step": 22329 + }, + { + "epoch": 2.8406055209260908, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9465312957763672, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8660593628883362, + "num_tokens": 852100596.0, + "step": 22330 + }, + { + "epoch": 2.8407327312046813, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7672817707061768, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8688004612922668, + "num_tokens": 852140320.0, + "step": 22331 + }, + { + "epoch": 2.840859941483272, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8062082529067993, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8643426299095154, + "num_tokens": 852175969.0, + "step": 22332 + }, + { + "epoch": 2.8409871517618623, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 7.794683933258057, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8517272472381592, + "num_tokens": 852210816.0, + "step": 22333 + }, + { + "epoch": 2.841114362040453, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.645925521850586, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8650134801864624, + "num_tokens": 852246684.0, + "step": 22334 + }, + { + "epoch": 2.8412415723190434, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9811735153198242, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8696010112762451, + "num_tokens": 852285283.0, + "step": 22335 + }, + { + "epoch": 2.841368782597634, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.977920413017273, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8725504875183105, + "num_tokens": 852323310.0, + "step": 22336 + }, + { + "epoch": 2.8414959928762245, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8169422149658203, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.869086742401123, + "num_tokens": 852360850.0, + "step": 22337 + }, + { + "epoch": 2.841623203154815, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8395459651947021, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8597829937934875, + "num_tokens": 852399320.0, + "step": 22338 + }, + { + "epoch": 2.8417504134334055, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0223324298858643, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8624670505523682, + "num_tokens": 852430980.0, + "step": 22339 + }, + { + "epoch": 2.841877623711996, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9037587642669678, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8769372701644897, + "num_tokens": 852465688.0, + "step": 22340 + }, + { + "epoch": 2.8420048339905866, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8227689266204834, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8640568256378174, + "num_tokens": 852504372.0, + "step": 22341 + }, + { + "epoch": 2.842132044269177, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8998194932937622, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.870278000831604, + "num_tokens": 852542232.0, + "step": 22342 + }, + { + "epoch": 2.8422592545477676, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9055461883544922, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8576765656471252, + "num_tokens": 852580005.0, + "step": 22343 + }, + { + "epoch": 2.842386464826358, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8060340881347656, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8782963752746582, + "num_tokens": 852619912.0, + "step": 22344 + }, + { + "epoch": 2.8425136751049482, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8514312505722046, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8682589530944824, + "num_tokens": 852654349.0, + "step": 22345 + }, + { + "epoch": 2.842640885383539, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 16.61963653564453, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8579957485198975, + "num_tokens": 852691300.0, + "step": 22346 + }, + { + "epoch": 2.8427680956621293, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 7.781073570251465, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8645747900009155, + "num_tokens": 852731228.0, + "step": 22347 + }, + { + "epoch": 2.8428953059407203, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0367817878723145, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8640278577804565, + "num_tokens": 852768333.0, + "step": 22348 + }, + { + "epoch": 2.8430225162193103, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0749619007110596, + "learning_rate": 1e-06, + "loss": 0.5345, + "mean_token_accuracy": 0.8331969976425171, + "num_tokens": 852810962.0, + "step": 22349 + }, + { + "epoch": 2.843149726497901, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9260880947113037, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8820996284484863, + "num_tokens": 852844326.0, + "step": 22350 + }, + { + "epoch": 2.8432769367764914, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0162668228149414, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8595210909843445, + "num_tokens": 852877737.0, + "step": 22351 + }, + { + "epoch": 2.843404147055082, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2796640396118164, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8725165128707886, + "num_tokens": 852922085.0, + "step": 22352 + }, + { + "epoch": 2.8435313573336725, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7999114990234375, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8826766014099121, + "num_tokens": 852960742.0, + "step": 22353 + }, + { + "epoch": 2.843658567612263, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.910478115081787, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8702728748321533, + "num_tokens": 852993502.0, + "step": 22354 + }, + { + "epoch": 2.8437857778908535, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7299152612686157, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8718372583389282, + "num_tokens": 853036844.0, + "step": 22355 + }, + { + "epoch": 2.843912988169444, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.013240098953247, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8648321628570557, + "num_tokens": 853070851.0, + "step": 22356 + }, + { + "epoch": 2.8440401984480346, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8747434616088867, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8646537065505981, + "num_tokens": 853106116.0, + "step": 22357 + }, + { + "epoch": 2.844167408726625, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.6224662065505981, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8647712469100952, + "num_tokens": 853152776.0, + "step": 22358 + }, + { + "epoch": 2.8442946190052156, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.92911696434021, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8651280403137207, + "num_tokens": 853194176.0, + "step": 22359 + }, + { + "epoch": 2.844421829283806, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9147324562072754, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8635919094085693, + "num_tokens": 853236834.0, + "step": 22360 + }, + { + "epoch": 2.8445490395623967, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.312697410583496, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8569865226745605, + "num_tokens": 853274145.0, + "step": 22361 + }, + { + "epoch": 2.844676249840987, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.03261661529541, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8674672842025757, + "num_tokens": 853312569.0, + "step": 22362 + }, + { + "epoch": 2.8448034601195777, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.192462205886841, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8713207244873047, + "num_tokens": 853351742.0, + "step": 22363 + }, + { + "epoch": 2.8449306703981683, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.809646487236023, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8670942783355713, + "num_tokens": 853389607.0, + "step": 22364 + }, + { + "epoch": 2.845057880676759, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8543059825897217, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8727303743362427, + "num_tokens": 853427218.0, + "step": 22365 + }, + { + "epoch": 2.8451850909553493, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8254761695861816, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8702988624572754, + "num_tokens": 853464773.0, + "step": 22366 + }, + { + "epoch": 2.84531230123394, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7686034440994263, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8660677075386047, + "num_tokens": 853507250.0, + "step": 22367 + }, + { + "epoch": 2.84543951151253, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7653006315231323, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8687731027603149, + "num_tokens": 853551947.0, + "step": 22368 + }, + { + "epoch": 2.845566721791121, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9807264804840088, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8570082187652588, + "num_tokens": 853590557.0, + "step": 22369 + }, + { + "epoch": 2.845693932069711, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2078311443328857, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.859713077545166, + "num_tokens": 853630870.0, + "step": 22370 + }, + { + "epoch": 2.845821142348302, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.052913188934326, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8621656894683838, + "num_tokens": 853662360.0, + "step": 22371 + }, + { + "epoch": 2.845948352626892, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.898689866065979, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8612421154975891, + "num_tokens": 853703661.0, + "step": 22372 + }, + { + "epoch": 2.846075562905483, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9688540697097778, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8547523021697998, + "num_tokens": 853742398.0, + "step": 22373 + }, + { + "epoch": 2.846202773184073, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9237215518951416, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8725619912147522, + "num_tokens": 853781479.0, + "step": 22374 + }, + { + "epoch": 2.8463299834626636, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.872691035270691, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8411389589309692, + "num_tokens": 853822330.0, + "step": 22375 + }, + { + "epoch": 2.846457193741254, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.86831533908844, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8521618843078613, + "num_tokens": 853862747.0, + "step": 22376 + }, + { + "epoch": 2.8465844040198447, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8543014526367188, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8897308111190796, + "num_tokens": 853897124.0, + "step": 22377 + }, + { + "epoch": 2.8467116142984352, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0525171756744385, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8534629344940186, + "num_tokens": 853930516.0, + "step": 22378 + }, + { + "epoch": 2.8468388245770258, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9779484272003174, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8715760707855225, + "num_tokens": 853967016.0, + "step": 22379 + }, + { + "epoch": 2.8469660348556163, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9746811389923096, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8506624698638916, + "num_tokens": 854008804.0, + "step": 22380 + }, + { + "epoch": 2.847093245134207, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0674102306365967, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8533285856246948, + "num_tokens": 854048393.0, + "step": 22381 + }, + { + "epoch": 2.8472204554127973, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.81722092628479, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8720290660858154, + "num_tokens": 854089480.0, + "step": 22382 + }, + { + "epoch": 2.847347665691388, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.927100658416748, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8762844204902649, + "num_tokens": 854125677.0, + "step": 22383 + }, + { + "epoch": 2.8474748759699784, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.138948917388916, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.860718846321106, + "num_tokens": 854160485.0, + "step": 22384 + }, + { + "epoch": 2.847602086248569, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.942750334739685, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8577233552932739, + "num_tokens": 854200635.0, + "step": 22385 + }, + { + "epoch": 2.8477292965271594, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.794787883758545, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8702548742294312, + "num_tokens": 854240046.0, + "step": 22386 + }, + { + "epoch": 2.84785650680575, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.832736849784851, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8767514824867249, + "num_tokens": 854275880.0, + "step": 22387 + }, + { + "epoch": 2.8479837170843405, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9433797597885132, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8673108816146851, + "num_tokens": 854311204.0, + "step": 22388 + }, + { + "epoch": 2.848110927362931, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9799245595932007, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8625966310501099, + "num_tokens": 854346076.0, + "step": 22389 + }, + { + "epoch": 2.8482381376415216, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7918245792388916, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8732401728630066, + "num_tokens": 854383075.0, + "step": 22390 + }, + { + "epoch": 2.848365347920112, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.980910062789917, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.857029378414154, + "num_tokens": 854416011.0, + "step": 22391 + }, + { + "epoch": 2.8484925581987026, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9001191854476929, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8680105209350586, + "num_tokens": 854449520.0, + "step": 22392 + }, + { + "epoch": 2.8486197684772927, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0086467266082764, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8629041314125061, + "num_tokens": 854484355.0, + "step": 22393 + }, + { + "epoch": 2.8487469787558837, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8039517402648926, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8513434529304504, + "num_tokens": 854526238.0, + "step": 22394 + }, + { + "epoch": 2.8488741890344738, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9396530389785767, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8665153384208679, + "num_tokens": 854562619.0, + "step": 22395 + }, + { + "epoch": 2.8490013993130647, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0406317710876465, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8695145845413208, + "num_tokens": 854604974.0, + "step": 22396 + }, + { + "epoch": 2.849128609591655, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7766139507293701, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8741446733474731, + "num_tokens": 854644066.0, + "step": 22397 + }, + { + "epoch": 2.849255819870246, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.773309350013733, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8686772584915161, + "num_tokens": 854682723.0, + "step": 22398 + }, + { + "epoch": 2.849383030148836, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9221638441085815, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8748331069946289, + "num_tokens": 854720292.0, + "step": 22399 + }, + { + "epoch": 2.8495102404274264, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8961679935455322, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8621549010276794, + "num_tokens": 854756917.0, + "step": 22400 + }, + { + "epoch": 2.849637450706017, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7952824831008911, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8660132884979248, + "num_tokens": 854798771.0, + "step": 22401 + }, + { + "epoch": 2.8497646609846075, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7660763263702393, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8637901544570923, + "num_tokens": 854838193.0, + "step": 22402 + }, + { + "epoch": 2.849891871263198, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0218751430511475, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8593407869338989, + "num_tokens": 854871650.0, + "step": 22403 + }, + { + "epoch": 2.8500190815417885, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.949968934059143, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8637966513633728, + "num_tokens": 854910757.0, + "step": 22404 + }, + { + "epoch": 2.850146291820379, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.13077974319458, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8521789908409119, + "num_tokens": 854945018.0, + "step": 22405 + }, + { + "epoch": 2.8502735020989696, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7550522089004517, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.857991099357605, + "num_tokens": 854988369.0, + "step": 22406 + }, + { + "epoch": 2.85040071237756, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9074465036392212, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8672380447387695, + "num_tokens": 855023935.0, + "step": 22407 + }, + { + "epoch": 2.8505279226561506, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9138672351837158, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8673032522201538, + "num_tokens": 855062002.0, + "step": 22408 + }, + { + "epoch": 2.850655132934741, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9100441932678223, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8683348894119263, + "num_tokens": 855101257.0, + "step": 22409 + }, + { + "epoch": 2.8507823432133317, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9413847923278809, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8673298358917236, + "num_tokens": 855135834.0, + "step": 22410 + }, + { + "epoch": 2.850909553491922, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9173544645309448, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8682270050048828, + "num_tokens": 855174156.0, + "step": 22411 + }, + { + "epoch": 2.8510367637705127, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0850143432617188, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.875559389591217, + "num_tokens": 855207698.0, + "step": 22412 + }, + { + "epoch": 2.8511639740491033, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9961529970169067, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8658292293548584, + "num_tokens": 855240636.0, + "step": 22413 + }, + { + "epoch": 2.851291184327694, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9894930124282837, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.868327796459198, + "num_tokens": 855279923.0, + "step": 22414 + }, + { + "epoch": 2.8514183946062843, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9489662647247314, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8540105819702148, + "num_tokens": 855315460.0, + "step": 22415 + }, + { + "epoch": 2.851545604884875, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.006300687789917, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.873187780380249, + "num_tokens": 855351548.0, + "step": 22416 + }, + { + "epoch": 2.8516728151634654, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9407333135604858, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8463855981826782, + "num_tokens": 855390715.0, + "step": 22417 + }, + { + "epoch": 2.8518000254420555, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9520890712738037, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8520622253417969, + "num_tokens": 855431997.0, + "step": 22418 + }, + { + "epoch": 2.8519272357206464, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9231696128845215, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8600037693977356, + "num_tokens": 855470242.0, + "step": 22419 + }, + { + "epoch": 2.8520544459992365, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9599716663360596, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8521692156791687, + "num_tokens": 855508901.0, + "step": 22420 + }, + { + "epoch": 2.8521816562778275, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9478697776794434, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8625632524490356, + "num_tokens": 855545290.0, + "step": 22421 + }, + { + "epoch": 2.8523088665564176, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.238172769546509, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8614050149917603, + "num_tokens": 855579567.0, + "step": 22422 + }, + { + "epoch": 2.852436076835008, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.874634027481079, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8587680459022522, + "num_tokens": 855621117.0, + "step": 22423 + }, + { + "epoch": 2.8525632871135986, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.764558792114258, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8685396909713745, + "num_tokens": 855660494.0, + "step": 22424 + }, + { + "epoch": 2.852690497392189, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.108991861343384, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.864875316619873, + "num_tokens": 855694043.0, + "step": 22425 + }, + { + "epoch": 2.8528177076707797, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9670789241790771, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8521251678466797, + "num_tokens": 855734615.0, + "step": 22426 + }, + { + "epoch": 2.85294491794937, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1991326808929443, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8771849274635315, + "num_tokens": 855771141.0, + "step": 22427 + }, + { + "epoch": 2.8530721282279607, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9489740133285522, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8638489246368408, + "num_tokens": 855806877.0, + "step": 22428 + }, + { + "epoch": 2.8531993385065513, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9980762004852295, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8790790438652039, + "num_tokens": 855840076.0, + "step": 22429 + }, + { + "epoch": 2.853326548785142, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7906277179718018, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8759914040565491, + "num_tokens": 855877972.0, + "step": 22430 + }, + { + "epoch": 2.8534537590637323, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.808982491493225, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8597283363342285, + "num_tokens": 855917424.0, + "step": 22431 + }, + { + "epoch": 2.853580969342323, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1146085262298584, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8556691408157349, + "num_tokens": 855954853.0, + "step": 22432 + }, + { + "epoch": 2.8537081796209134, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9098334312438965, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8676512241363525, + "num_tokens": 855993300.0, + "step": 22433 + }, + { + "epoch": 2.853835389899504, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8722805976867676, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8688714504241943, + "num_tokens": 856035307.0, + "step": 22434 + }, + { + "epoch": 2.8539626001780944, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9396095275878906, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8644189238548279, + "num_tokens": 856071801.0, + "step": 22435 + }, + { + "epoch": 2.854089810456685, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.002046585083008, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8548975586891174, + "num_tokens": 856112035.0, + "step": 22436 + }, + { + "epoch": 2.8542170207352755, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9471474885940552, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8703134059906006, + "num_tokens": 856148275.0, + "step": 22437 + }, + { + "epoch": 2.854344231013866, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 3.40519380569458, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8678094148635864, + "num_tokens": 856180330.0, + "step": 22438 + }, + { + "epoch": 2.8544714412924566, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.3067209720611572, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8515563607215881, + "num_tokens": 856219354.0, + "step": 22439 + }, + { + "epoch": 2.854598651571047, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.013019323348999, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8760229349136353, + "num_tokens": 856257583.0, + "step": 22440 + }, + { + "epoch": 2.8547258618496376, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9317169189453125, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8479834198951721, + "num_tokens": 856298056.0, + "step": 22441 + }, + { + "epoch": 2.854853072128228, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.946572184562683, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8611049652099609, + "num_tokens": 856334109.0, + "step": 22442 + }, + { + "epoch": 2.8549802824068182, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3763673305511475, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.868505597114563, + "num_tokens": 856376655.0, + "step": 22443 + }, + { + "epoch": 2.855107492685409, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8427348136901855, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8826522827148438, + "num_tokens": 856409926.0, + "step": 22444 + }, + { + "epoch": 2.8552347029639993, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0044355392456055, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.855177104473114, + "num_tokens": 856451471.0, + "step": 22445 + }, + { + "epoch": 2.8553619132425903, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9206514358520508, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8739036321640015, + "num_tokens": 856483237.0, + "step": 22446 + }, + { + "epoch": 2.8554891235211803, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8023041486740112, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8577461242675781, + "num_tokens": 856521278.0, + "step": 22447 + }, + { + "epoch": 2.855616333799771, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8124009370803833, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8623137474060059, + "num_tokens": 856565048.0, + "step": 22448 + }, + { + "epoch": 2.8557435440783614, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.823934555053711, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8701212406158447, + "num_tokens": 856603756.0, + "step": 22449 + }, + { + "epoch": 2.855870754356952, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9142062664031982, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8713710904121399, + "num_tokens": 856640677.0, + "step": 22450 + }, + { + "epoch": 2.8559979646355425, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.85233473777771, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8650467395782471, + "num_tokens": 856679982.0, + "step": 22451 + }, + { + "epoch": 2.856125174914133, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7776063680648804, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8531122207641602, + "num_tokens": 856720621.0, + "step": 22452 + }, + { + "epoch": 2.8562523851927235, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9826964139938354, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8746635317802429, + "num_tokens": 856757172.0, + "step": 22453 + }, + { + "epoch": 2.856379595471314, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.998655080795288, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8616990447044373, + "num_tokens": 856796876.0, + "step": 22454 + }, + { + "epoch": 2.8565068057499046, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0157241821289062, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8649078607559204, + "num_tokens": 856831148.0, + "step": 22455 + }, + { + "epoch": 2.856634016028495, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9362342357635498, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8611018657684326, + "num_tokens": 856867091.0, + "step": 22456 + }, + { + "epoch": 2.8567612263070856, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9627875089645386, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8541062474250793, + "num_tokens": 856906118.0, + "step": 22457 + }, + { + "epoch": 2.856888436585676, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2019591331481934, + "learning_rate": 1e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.8391194343566895, + "num_tokens": 856942694.0, + "step": 22458 + }, + { + "epoch": 2.8570156468642667, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0945847034454346, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8570985198020935, + "num_tokens": 856977530.0, + "step": 22459 + }, + { + "epoch": 2.857142857142857, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2644693851470947, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8691356182098389, + "num_tokens": 857012417.0, + "step": 22460 + }, + { + "epoch": 2.8572700674214477, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 1.9267046451568604, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8585833311080933, + "num_tokens": 857053556.0, + "step": 22461 + }, + { + "epoch": 2.8573972777000383, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9873775243759155, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8698498606681824, + "num_tokens": 857094514.0, + "step": 22462 + }, + { + "epoch": 2.857524487978629, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8256046772003174, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8673649430274963, + "num_tokens": 857133670.0, + "step": 22463 + }, + { + "epoch": 2.8576516982572193, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8609929084777832, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8573799133300781, + "num_tokens": 857167934.0, + "step": 22464 + }, + { + "epoch": 2.85777890853581, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0933244228363037, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.847108781337738, + "num_tokens": 857208263.0, + "step": 22465 + }, + { + "epoch": 2.8579061188144, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9267460107803345, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8474860191345215, + "num_tokens": 857255107.0, + "step": 22466 + }, + { + "epoch": 2.858033329092991, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.799555778503418, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8635040521621704, + "num_tokens": 857298368.0, + "step": 22467 + }, + { + "epoch": 2.858160539371581, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.171576976776123, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.865151047706604, + "num_tokens": 857336899.0, + "step": 22468 + }, + { + "epoch": 2.858287749650172, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7702372074127197, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8694548606872559, + "num_tokens": 857378482.0, + "step": 22469 + }, + { + "epoch": 2.858414959928762, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9318686723709106, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8740020990371704, + "num_tokens": 857413203.0, + "step": 22470 + }, + { + "epoch": 2.858542170207353, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0543463230133057, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.863304853439331, + "num_tokens": 857446706.0, + "step": 22471 + }, + { + "epoch": 2.858669380485943, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9178776741027832, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8691142797470093, + "num_tokens": 857483415.0, + "step": 22472 + }, + { + "epoch": 2.8587965907645336, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 4.866605281829834, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8703118562698364, + "num_tokens": 857525323.0, + "step": 22473 + }, + { + "epoch": 2.858923801043124, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8369355201721191, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8813417553901672, + "num_tokens": 857564400.0, + "step": 22474 + }, + { + "epoch": 2.8590510113217147, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9743905067443848, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.863653838634491, + "num_tokens": 857602756.0, + "step": 22475 + }, + { + "epoch": 2.859178221600305, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1423306465148926, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8641042709350586, + "num_tokens": 857636875.0, + "step": 22476 + }, + { + "epoch": 2.8593054318788957, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8381062746047974, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8816766738891602, + "num_tokens": 857671950.0, + "step": 22477 + }, + { + "epoch": 2.8594326421574863, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.9210665225982666, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8796491622924805, + "num_tokens": 857705274.0, + "step": 22478 + }, + { + "epoch": 2.859559852436077, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8499850034713745, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8584102392196655, + "num_tokens": 857743730.0, + "step": 22479 + }, + { + "epoch": 2.8596870627146673, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.85822331905365, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.862164318561554, + "num_tokens": 857784055.0, + "step": 22480 + }, + { + "epoch": 2.859814272993258, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8215546607971191, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8716379404067993, + "num_tokens": 857822431.0, + "step": 22481 + }, + { + "epoch": 2.8599414832718484, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8372076749801636, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8770244121551514, + "num_tokens": 857859258.0, + "step": 22482 + }, + { + "epoch": 2.860068693550439, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.955002784729004, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8562997579574585, + "num_tokens": 857901209.0, + "step": 22483 + }, + { + "epoch": 2.8601959038290294, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.883362889289856, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8768951296806335, + "num_tokens": 857935563.0, + "step": 22484 + }, + { + "epoch": 2.86032311410762, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.006338357925415, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8545553684234619, + "num_tokens": 857966777.0, + "step": 22485 + }, + { + "epoch": 2.8604503243862105, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7894519567489624, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8721083998680115, + "num_tokens": 858005078.0, + "step": 22486 + }, + { + "epoch": 2.860577534664801, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8489285707473755, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8527507781982422, + "num_tokens": 858049288.0, + "step": 22487 + }, + { + "epoch": 2.8607047449433916, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.972869873046875, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8876889944076538, + "num_tokens": 858089834.0, + "step": 22488 + }, + { + "epoch": 2.860831955221982, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8659499883651733, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8711939454078674, + "num_tokens": 858124152.0, + "step": 22489 + }, + { + "epoch": 2.8609591655005726, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.820401668548584, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8696337342262268, + "num_tokens": 858163202.0, + "step": 22490 + }, + { + "epoch": 2.8610863757791627, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8148928880691528, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8860217928886414, + "num_tokens": 858199173.0, + "step": 22491 + }, + { + "epoch": 2.8612135860577537, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.879272699356079, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8640705347061157, + "num_tokens": 858239693.0, + "step": 22492 + }, + { + "epoch": 2.8613407963363438, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9530558586120605, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.873083770275116, + "num_tokens": 858280720.0, + "step": 22493 + }, + { + "epoch": 2.8614680066149347, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.928038477897644, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.877912163734436, + "num_tokens": 858314145.0, + "step": 22494 + }, + { + "epoch": 2.861595216893525, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7835015058517456, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8668918013572693, + "num_tokens": 858355177.0, + "step": 22495 + }, + { + "epoch": 2.861722427172116, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.01932954788208, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.869187593460083, + "num_tokens": 858393070.0, + "step": 22496 + }, + { + "epoch": 2.861849637450706, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.901863932609558, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8655738830566406, + "num_tokens": 858431827.0, + "step": 22497 + }, + { + "epoch": 2.8619768477292964, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8604655265808105, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8636780977249146, + "num_tokens": 858474990.0, + "step": 22498 + }, + { + "epoch": 2.862104058007887, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7306898832321167, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8787780404090881, + "num_tokens": 858514698.0, + "step": 22499 + }, + { + "epoch": 2.8622312682864774, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8514704704284668, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8666014075279236, + "num_tokens": 858557601.0, + "step": 22500 + }, + { + "epoch": 2.862358478565068, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.702597975730896, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8755037784576416, + "num_tokens": 858596830.0, + "step": 22501 + }, + { + "epoch": 2.8624856888436585, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8516435623168945, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8562859296798706, + "num_tokens": 858637555.0, + "step": 22502 + }, + { + "epoch": 2.862612899122249, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.044685125350952, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.884583055973053, + "num_tokens": 858672813.0, + "step": 22503 + }, + { + "epoch": 2.8627401094008396, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.0152933597564697, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8735595941543579, + "num_tokens": 858710743.0, + "step": 22504 + }, + { + "epoch": 2.86286731967943, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.089495897293091, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8787111043930054, + "num_tokens": 858739486.0, + "step": 22505 + }, + { + "epoch": 2.8629945299580206, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9069576263427734, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8462111949920654, + "num_tokens": 858780678.0, + "step": 22506 + }, + { + "epoch": 2.863121740236611, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.750180959701538, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8621257543563843, + "num_tokens": 858826112.0, + "step": 22507 + }, + { + "epoch": 2.8632489505152017, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.7645070552825928, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8768537044525146, + "num_tokens": 858862158.0, + "step": 22508 + }, + { + "epoch": 2.863376160793792, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8455016613006592, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8703453540802002, + "num_tokens": 858905270.0, + "step": 22509 + }, + { + "epoch": 2.8635033710723827, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8745616674423218, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8605443239212036, + "num_tokens": 858941802.0, + "step": 22510 + }, + { + "epoch": 2.8636305813509733, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.046194553375244, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8683308959007263, + "num_tokens": 858974672.0, + "step": 22511 + }, + { + "epoch": 2.863757791629564, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8683987855911255, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8642104864120483, + "num_tokens": 859015305.0, + "step": 22512 + }, + { + "epoch": 2.8638850019081543, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.097809076309204, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8577223420143127, + "num_tokens": 859050646.0, + "step": 22513 + }, + { + "epoch": 2.864012212186745, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.855322241783142, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8758009076118469, + "num_tokens": 859090139.0, + "step": 22514 + }, + { + "epoch": 2.8641394224653354, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9944409132003784, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8706580400466919, + "num_tokens": 859130661.0, + "step": 22515 + }, + { + "epoch": 2.8642666327439255, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9311052560806274, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.865423321723938, + "num_tokens": 859166399.0, + "step": 22516 + }, + { + "epoch": 2.8643938430225164, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.801983118057251, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8664929866790771, + "num_tokens": 859208349.0, + "step": 22517 + }, + { + "epoch": 2.8645210533011065, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9073606729507446, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8681091070175171, + "num_tokens": 859243784.0, + "step": 22518 + }, + { + "epoch": 2.8646482635796975, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9977163076400757, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8524216413497925, + "num_tokens": 859278345.0, + "step": 22519 + }, + { + "epoch": 2.8647754738582876, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.952135682106018, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8556868433952332, + "num_tokens": 859317454.0, + "step": 22520 + }, + { + "epoch": 2.864902684136878, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9896315336227417, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8622099161148071, + "num_tokens": 859352076.0, + "step": 22521 + }, + { + "epoch": 2.8650298944154686, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9764792919158936, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8737121820449829, + "num_tokens": 859389957.0, + "step": 22522 + }, + { + "epoch": 2.865157104694059, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.022007703781128, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8661940693855286, + "num_tokens": 859425167.0, + "step": 22523 + }, + { + "epoch": 2.8652843149726497, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.815102219581604, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8633761405944824, + "num_tokens": 859470999.0, + "step": 22524 + }, + { + "epoch": 2.86541152525124, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9738613367080688, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.872873842716217, + "num_tokens": 859503763.0, + "step": 22525 + }, + { + "epoch": 2.8655387355298307, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.226679801940918, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8757314085960388, + "num_tokens": 859537739.0, + "step": 22526 + }, + { + "epoch": 2.8656659458084213, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.925045371055603, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8765502572059631, + "num_tokens": 859573497.0, + "step": 22527 + }, + { + "epoch": 2.865793156087012, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.061471462249756, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8620393872261047, + "num_tokens": 859606997.0, + "step": 22528 + }, + { + "epoch": 2.8659203663656023, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8650498390197754, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8537607192993164, + "num_tokens": 859642278.0, + "step": 22529 + }, + { + "epoch": 2.866047576644193, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.841187596321106, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8786338567733765, + "num_tokens": 859684728.0, + "step": 22530 + }, + { + "epoch": 2.8661747869227834, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9391052722930908, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8595775365829468, + "num_tokens": 859720946.0, + "step": 22531 + }, + { + "epoch": 2.866301997201374, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0658175945281982, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8631832599639893, + "num_tokens": 859752605.0, + "step": 22532 + }, + { + "epoch": 2.8664292074799644, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9405256509780884, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.878059446811676, + "num_tokens": 859784206.0, + "step": 22533 + }, + { + "epoch": 2.866556417758555, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8125042915344238, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8640283346176147, + "num_tokens": 859828142.0, + "step": 22534 + }, + { + "epoch": 2.8666836280371455, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.108165740966797, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8629393577575684, + "num_tokens": 859870032.0, + "step": 22535 + }, + { + "epoch": 2.866810838315736, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7724465131759644, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8571251630783081, + "num_tokens": 859911417.0, + "step": 22536 + }, + { + "epoch": 2.8669380485943265, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7881238460540771, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8772122859954834, + "num_tokens": 859953929.0, + "step": 22537 + }, + { + "epoch": 2.867065258872917, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9854605197906494, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8686553239822388, + "num_tokens": 859985402.0, + "step": 22538 + }, + { + "epoch": 2.8671924691515076, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.104483127593994, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8632532358169556, + "num_tokens": 860017374.0, + "step": 22539 + }, + { + "epoch": 2.867319679430098, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7269606590270996, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8760549426078796, + "num_tokens": 860058217.0, + "step": 22540 + }, + { + "epoch": 2.867446889708688, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.104586362838745, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.849515438079834, + "num_tokens": 860092317.0, + "step": 22541 + }, + { + "epoch": 2.867574099987279, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.957385778427124, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8631830811500549, + "num_tokens": 860125067.0, + "step": 22542 + }, + { + "epoch": 2.8677013102658693, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7886402606964111, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8798903226852417, + "num_tokens": 860163805.0, + "step": 22543 + }, + { + "epoch": 2.8678285205444602, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0163321495056152, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8532256484031677, + "num_tokens": 860200178.0, + "step": 22544 + }, + { + "epoch": 2.8679557308230503, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.040109634399414, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8647137880325317, + "num_tokens": 860235283.0, + "step": 22545 + }, + { + "epoch": 2.868082941101641, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7250295877456665, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8838801383972168, + "num_tokens": 860276776.0, + "step": 22546 + }, + { + "epoch": 2.8682101513802314, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9565759897232056, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8709191679954529, + "num_tokens": 860318492.0, + "step": 22547 + }, + { + "epoch": 2.868337361658822, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9666297435760498, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.851037323474884, + "num_tokens": 860358556.0, + "step": 22548 + }, + { + "epoch": 2.8684645719374124, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.213571071624756, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8545172214508057, + "num_tokens": 860391114.0, + "step": 22549 + }, + { + "epoch": 2.868591782216003, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9392226934432983, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8621681928634644, + "num_tokens": 860430696.0, + "step": 22550 + }, + { + "epoch": 2.8687189924945935, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0103392601013184, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8733102083206177, + "num_tokens": 860471055.0, + "step": 22551 + }, + { + "epoch": 2.868846202773184, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9143658876419067, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8604583740234375, + "num_tokens": 860512472.0, + "step": 22552 + }, + { + "epoch": 2.8689734130517746, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1615800857543945, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8620493412017822, + "num_tokens": 860541014.0, + "step": 22553 + }, + { + "epoch": 2.869100623330365, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.66079580783844, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8813234567642212, + "num_tokens": 860585025.0, + "step": 22554 + }, + { + "epoch": 2.8692278336089556, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0901331901550293, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8743391036987305, + "num_tokens": 860616361.0, + "step": 22555 + }, + { + "epoch": 2.869355043887546, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9273961782455444, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8582514524459839, + "num_tokens": 860655976.0, + "step": 22556 + }, + { + "epoch": 2.8694822541661367, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8085261583328247, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8537347316741943, + "num_tokens": 860703949.0, + "step": 22557 + }, + { + "epoch": 2.869609464444727, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.004223108291626, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8666749000549316, + "num_tokens": 860739935.0, + "step": 22558 + }, + { + "epoch": 2.8697366747233177, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7761789560317993, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8685492277145386, + "num_tokens": 860782483.0, + "step": 22559 + }, + { + "epoch": 2.8698638850019083, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.071734666824341, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8549169898033142, + "num_tokens": 860816821.0, + "step": 22560 + }, + { + "epoch": 2.869991095280499, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0601558685302734, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8548570871353149, + "num_tokens": 860851313.0, + "step": 22561 + }, + { + "epoch": 2.8701183055590893, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9534176588058472, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8742108345031738, + "num_tokens": 860890540.0, + "step": 22562 + }, + { + "epoch": 2.87024551583768, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.171663999557495, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8586204051971436, + "num_tokens": 860929686.0, + "step": 22563 + }, + { + "epoch": 2.87037272611627, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8167117834091187, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.867760956287384, + "num_tokens": 860968186.0, + "step": 22564 + }, + { + "epoch": 2.870499936394861, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8245993852615356, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8608956933021545, + "num_tokens": 861007252.0, + "step": 22565 + }, + { + "epoch": 2.870627146673451, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7748764753341675, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8679752349853516, + "num_tokens": 861049898.0, + "step": 22566 + }, + { + "epoch": 2.870754356952042, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1448252201080322, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8724730014801025, + "num_tokens": 861087711.0, + "step": 22567 + }, + { + "epoch": 2.870881567230632, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9690260887145996, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8448804616928101, + "num_tokens": 861128566.0, + "step": 22568 + }, + { + "epoch": 2.871008777509223, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0635359287261963, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8702026605606079, + "num_tokens": 861156299.0, + "step": 22569 + }, + { + "epoch": 2.871135987787813, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9715168476104736, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8778302669525146, + "num_tokens": 861189376.0, + "step": 22570 + }, + { + "epoch": 2.8712631980664036, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0723156929016113, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8612658381462097, + "num_tokens": 861227156.0, + "step": 22571 + }, + { + "epoch": 2.871390408344994, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.867309808731079, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8653292655944824, + "num_tokens": 861268088.0, + "step": 22572 + }, + { + "epoch": 2.8715176186235847, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7613142728805542, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8677297830581665, + "num_tokens": 861308892.0, + "step": 22573 + }, + { + "epoch": 2.871644828902175, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8678557872772217, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8695677518844604, + "num_tokens": 861350108.0, + "step": 22574 + }, + { + "epoch": 2.8717720391807657, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7898346185684204, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8772403597831726, + "num_tokens": 861390553.0, + "step": 22575 + }, + { + "epoch": 2.8718992494593563, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8618834018707275, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8677873015403748, + "num_tokens": 861425878.0, + "step": 22576 + }, + { + "epoch": 2.872026459737947, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8810254335403442, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8817805051803589, + "num_tokens": 861460935.0, + "step": 22577 + }, + { + "epoch": 2.8721536700165373, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9499549865722656, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.848962664604187, + "num_tokens": 861500446.0, + "step": 22578 + }, + { + "epoch": 2.872280880295128, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 5.127812385559082, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8537369966506958, + "num_tokens": 861538000.0, + "step": 22579 + }, + { + "epoch": 2.8724080905737184, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1282026767730713, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8688786029815674, + "num_tokens": 861578052.0, + "step": 22580 + }, + { + "epoch": 2.872535300852309, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9346867799758911, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8624343872070312, + "num_tokens": 861619065.0, + "step": 22581 + }, + { + "epoch": 2.8726625111308994, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8589271306991577, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8692688941955566, + "num_tokens": 861660112.0, + "step": 22582 + }, + { + "epoch": 2.87278972140949, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9223504066467285, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8686519861221313, + "num_tokens": 861699312.0, + "step": 22583 + }, + { + "epoch": 2.8729169316880805, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.836279034614563, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.883246123790741, + "num_tokens": 861735167.0, + "step": 22584 + }, + { + "epoch": 2.873044141966671, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7813947200775146, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8594001531600952, + "num_tokens": 861778594.0, + "step": 22585 + }, + { + "epoch": 2.8731713522452615, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8928430080413818, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8653924465179443, + "num_tokens": 861814754.0, + "step": 22586 + }, + { + "epoch": 2.873298562523852, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.94407320022583, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8500588536262512, + "num_tokens": 861849019.0, + "step": 22587 + }, + { + "epoch": 2.8734257728024426, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8825767040252686, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8631561994552612, + "num_tokens": 861889471.0, + "step": 22588 + }, + { + "epoch": 2.8735529830810327, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.843319058418274, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8587246537208557, + "num_tokens": 861926811.0, + "step": 22589 + }, + { + "epoch": 2.8736801933596237, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9511420726776123, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8635406494140625, + "num_tokens": 861963628.0, + "step": 22590 + }, + { + "epoch": 2.8738074036382137, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9856305122375488, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8518523573875427, + "num_tokens": 862002915.0, + "step": 22591 + }, + { + "epoch": 2.8739346139168047, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7192052602767944, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8714072108268738, + "num_tokens": 862047276.0, + "step": 22592 + }, + { + "epoch": 2.874061824195395, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6966267824172974, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8730893731117249, + "num_tokens": 862091241.0, + "step": 22593 + }, + { + "epoch": 2.8741890344739858, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.8516510725021362, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8685585260391235, + "num_tokens": 862126025.0, + "step": 22594 + }, + { + "epoch": 2.874316244752576, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9563573598861694, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8565495610237122, + "num_tokens": 862160822.0, + "step": 22595 + }, + { + "epoch": 2.8744434550311664, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8546563386917114, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8701996803283691, + "num_tokens": 862196808.0, + "step": 22596 + }, + { + "epoch": 2.874570665309757, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9577982425689697, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8500144481658936, + "num_tokens": 862233722.0, + "step": 22597 + }, + { + "epoch": 2.8746978755883474, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7317721843719482, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8828639984130859, + "num_tokens": 862271914.0, + "step": 22598 + }, + { + "epoch": 2.874825085866938, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7415379285812378, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8627737760543823, + "num_tokens": 862313696.0, + "step": 22599 + }, + { + "epoch": 2.8749522961455285, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.9390783309936523, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8596400022506714, + "num_tokens": 862355366.0, + "step": 22600 + }, + { + "epoch": 2.875079506424119, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9496746063232422, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8582265377044678, + "num_tokens": 862395630.0, + "step": 22601 + }, + { + "epoch": 2.8752067167027096, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.003551721572876, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8653618097305298, + "num_tokens": 862429650.0, + "step": 22602 + }, + { + "epoch": 2.8753339269813, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.900632381439209, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8634529113769531, + "num_tokens": 862470921.0, + "step": 22603 + }, + { + "epoch": 2.8754611372598906, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.061537027359009, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8715565204620361, + "num_tokens": 862505204.0, + "step": 22604 + }, + { + "epoch": 2.875588347538481, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8372372388839722, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8637966513633728, + "num_tokens": 862548686.0, + "step": 22605 + }, + { + "epoch": 2.8757155578170717, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.827353835105896, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8791564702987671, + "num_tokens": 862585920.0, + "step": 22606 + }, + { + "epoch": 2.875842768095662, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9523123502731323, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8593540787696838, + "num_tokens": 862619364.0, + "step": 22607 + }, + { + "epoch": 2.8759699783742527, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8411749601364136, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8614309430122375, + "num_tokens": 862661317.0, + "step": 22608 + }, + { + "epoch": 2.8760971886528433, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8791093826293945, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8703442811965942, + "num_tokens": 862698236.0, + "step": 22609 + }, + { + "epoch": 2.876224398931434, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8732911348342896, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8772854804992676, + "num_tokens": 862739224.0, + "step": 22610 + }, + { + "epoch": 2.8763516092100243, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9348430633544922, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8804370164871216, + "num_tokens": 862776534.0, + "step": 22611 + }, + { + "epoch": 2.876478819488615, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8981910943984985, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8584085702896118, + "num_tokens": 862815690.0, + "step": 22612 + }, + { + "epoch": 2.8766060297672054, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8777137994766235, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8657203316688538, + "num_tokens": 862851021.0, + "step": 22613 + }, + { + "epoch": 2.8767332400457954, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9720983505249023, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8730936050415039, + "num_tokens": 862885122.0, + "step": 22614 + }, + { + "epoch": 2.8768604503243864, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8729326725006104, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8722946047782898, + "num_tokens": 862922233.0, + "step": 22615 + }, + { + "epoch": 2.8769876606029765, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8999439477920532, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8784791231155396, + "num_tokens": 862961355.0, + "step": 22616 + }, + { + "epoch": 2.8771148708815675, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8005475997924805, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8752655386924744, + "num_tokens": 862998523.0, + "step": 22617 + }, + { + "epoch": 2.8772420811601576, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.762953758239746, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8654035329818726, + "num_tokens": 863039684.0, + "step": 22618 + }, + { + "epoch": 2.877369291438748, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.915962815284729, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8703180551528931, + "num_tokens": 863077649.0, + "step": 22619 + }, + { + "epoch": 2.8774965017173386, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8692100048065186, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8623675107955933, + "num_tokens": 863116816.0, + "step": 22620 + }, + { + "epoch": 2.877623711995929, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9838693141937256, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8772470951080322, + "num_tokens": 863155306.0, + "step": 22621 + }, + { + "epoch": 2.8777509222745197, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.907031536102295, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.873497724533081, + "num_tokens": 863194381.0, + "step": 22622 + }, + { + "epoch": 2.87787813255311, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0672507286071777, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8650327920913696, + "num_tokens": 863231470.0, + "step": 22623 + }, + { + "epoch": 2.8780053428317007, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.121385335922241, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8585077524185181, + "num_tokens": 863265617.0, + "step": 22624 + }, + { + "epoch": 2.8781325531102913, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0057790279388428, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8567112684249878, + "num_tokens": 863300111.0, + "step": 22625 + }, + { + "epoch": 2.878259763388882, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8385244607925415, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8740291595458984, + "num_tokens": 863338626.0, + "step": 22626 + }, + { + "epoch": 2.8783869736674723, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.000544548034668, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8579944968223572, + "num_tokens": 863374672.0, + "step": 22627 + }, + { + "epoch": 2.878514183946063, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0899741649627686, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8600737452507019, + "num_tokens": 863412717.0, + "step": 22628 + }, + { + "epoch": 2.8786413942246534, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.012960910797119, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8744359016418457, + "num_tokens": 863447776.0, + "step": 22629 + }, + { + "epoch": 2.878768604503244, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9160292148590088, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8824788331985474, + "num_tokens": 863485697.0, + "step": 22630 + }, + { + "epoch": 2.8788958147818344, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.873393177986145, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8612086772918701, + "num_tokens": 863528706.0, + "step": 22631 + }, + { + "epoch": 2.879023025060425, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.017066478729248, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8484793901443481, + "num_tokens": 863569181.0, + "step": 22632 + }, + { + "epoch": 2.8791502353390155, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.114142894744873, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.849331796169281, + "num_tokens": 863602985.0, + "step": 22633 + }, + { + "epoch": 2.879277445617606, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1766176223754883, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8701878786087036, + "num_tokens": 863634590.0, + "step": 22634 + }, + { + "epoch": 2.8794046558961965, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7964861392974854, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8658941984176636, + "num_tokens": 863677671.0, + "step": 22635 + }, + { + "epoch": 2.879531866174787, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8534668684005737, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.868672251701355, + "num_tokens": 863716102.0, + "step": 22636 + }, + { + "epoch": 2.8796590764533776, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0488240718841553, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8581030964851379, + "num_tokens": 863751560.0, + "step": 22637 + }, + { + "epoch": 2.879786286731968, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8080495595932007, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8796188831329346, + "num_tokens": 863792373.0, + "step": 22638 + }, + { + "epoch": 2.879913497010558, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8997105360031128, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8624853491783142, + "num_tokens": 863829969.0, + "step": 22639 + }, + { + "epoch": 2.880040707289149, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9321675300598145, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8731365203857422, + "num_tokens": 863867080.0, + "step": 22640 + }, + { + "epoch": 2.8801679175677393, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9692829847335815, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8695667386054993, + "num_tokens": 863904230.0, + "step": 22641 + }, + { + "epoch": 2.8802951278463302, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.822412371635437, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8706352710723877, + "num_tokens": 863947309.0, + "step": 22642 + }, + { + "epoch": 2.8804223381249203, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8883745670318604, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8722140192985535, + "num_tokens": 863985557.0, + "step": 22643 + }, + { + "epoch": 2.880549548403511, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9743623733520508, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8549849987030029, + "num_tokens": 864021842.0, + "step": 22644 + }, + { + "epoch": 2.8806767586821014, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8368061780929565, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8586606979370117, + "num_tokens": 864060765.0, + "step": 22645 + }, + { + "epoch": 2.880803968960692, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9053735733032227, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.868729829788208, + "num_tokens": 864100624.0, + "step": 22646 + }, + { + "epoch": 2.8809311792392824, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9024453163146973, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8777549266815186, + "num_tokens": 864136599.0, + "step": 22647 + }, + { + "epoch": 2.881058389517873, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.013455867767334, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8705917000770569, + "num_tokens": 864169876.0, + "step": 22648 + }, + { + "epoch": 2.8811855997964635, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8652399778366089, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8648564219474792, + "num_tokens": 864213709.0, + "step": 22649 + }, + { + "epoch": 2.881312810075054, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0029022693634033, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8724084496498108, + "num_tokens": 864245581.0, + "step": 22650 + }, + { + "epoch": 2.8814400203536445, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0972812175750732, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8517051339149475, + "num_tokens": 864287081.0, + "step": 22651 + }, + { + "epoch": 2.881567230632235, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0324511528015137, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8493703603744507, + "num_tokens": 864321480.0, + "step": 22652 + }, + { + "epoch": 2.8816944409108256, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8946614265441895, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8729855418205261, + "num_tokens": 864358580.0, + "step": 22653 + }, + { + "epoch": 2.881821651189416, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9315983057022095, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8549414873123169, + "num_tokens": 864396277.0, + "step": 22654 + }, + { + "epoch": 2.8819488614680067, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9812439680099487, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.861831784248352, + "num_tokens": 864433282.0, + "step": 22655 + }, + { + "epoch": 2.882076071746597, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9159513711929321, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.857756495475769, + "num_tokens": 864469305.0, + "step": 22656 + }, + { + "epoch": 2.8822032820251877, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.935688853263855, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8667415380477905, + "num_tokens": 864507246.0, + "step": 22657 + }, + { + "epoch": 2.8823304923037782, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.824648141860962, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.86162269115448, + "num_tokens": 864547315.0, + "step": 22658 + }, + { + "epoch": 2.8824577025823688, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8906049728393555, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8593817949295044, + "num_tokens": 864590813.0, + "step": 22659 + }, + { + "epoch": 2.8825849128609593, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9812161922454834, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8560258746147156, + "num_tokens": 864625078.0, + "step": 22660 + }, + { + "epoch": 2.88271212313955, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8606336116790771, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8741105198860168, + "num_tokens": 864663714.0, + "step": 22661 + }, + { + "epoch": 2.88283933341814, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.091494083404541, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.864864706993103, + "num_tokens": 864697026.0, + "step": 22662 + }, + { + "epoch": 2.882966543696731, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9656856060028076, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.875383198261261, + "num_tokens": 864732337.0, + "step": 22663 + }, + { + "epoch": 2.883093753975321, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.736462950706482, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.859241247177124, + "num_tokens": 864776598.0, + "step": 22664 + }, + { + "epoch": 2.883220964253912, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8575304746627808, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8527923822402954, + "num_tokens": 864817279.0, + "step": 22665 + }, + { + "epoch": 2.883348174532502, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.83837890625, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8735328912734985, + "num_tokens": 864854914.0, + "step": 22666 + }, + { + "epoch": 2.883475384811093, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.927189826965332, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8693139553070068, + "num_tokens": 864892914.0, + "step": 22667 + }, + { + "epoch": 2.883602595089683, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8314545154571533, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8629823327064514, + "num_tokens": 864934189.0, + "step": 22668 + }, + { + "epoch": 2.8837298053682736, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8558454513549805, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8748177289962769, + "num_tokens": 864972085.0, + "step": 22669 + }, + { + "epoch": 2.883857015646864, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0822086334228516, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8497745990753174, + "num_tokens": 865008674.0, + "step": 22670 + }, + { + "epoch": 2.8839842259254547, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9033373594284058, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8553408980369568, + "num_tokens": 865046004.0, + "step": 22671 + }, + { + "epoch": 2.884111436204045, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9235771894454956, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8684831261634827, + "num_tokens": 865082409.0, + "step": 22672 + }, + { + "epoch": 2.8842386464826357, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.95200777053833, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8649829030036926, + "num_tokens": 865124457.0, + "step": 22673 + }, + { + "epoch": 2.8843658567612263, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9522550106048584, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8781702518463135, + "num_tokens": 865162290.0, + "step": 22674 + }, + { + "epoch": 2.884493067039817, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8213129043579102, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8621309399604797, + "num_tokens": 865201686.0, + "step": 22675 + }, + { + "epoch": 2.8846202773184073, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9742918014526367, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8544116020202637, + "num_tokens": 865236876.0, + "step": 22676 + }, + { + "epoch": 2.884747487596998, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.906447172164917, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8695448637008667, + "num_tokens": 865276994.0, + "step": 22677 + }, + { + "epoch": 2.8848746978755884, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.036872386932373, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8643902540206909, + "num_tokens": 865311990.0, + "step": 22678 + }, + { + "epoch": 2.885001908154179, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9496458768844604, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8710522055625916, + "num_tokens": 865352324.0, + "step": 22679 + }, + { + "epoch": 2.8851291184327694, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.939341425895691, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.865467369556427, + "num_tokens": 865391428.0, + "step": 22680 + }, + { + "epoch": 2.88525632871136, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.871056318283081, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8624486923217773, + "num_tokens": 865428478.0, + "step": 22681 + }, + { + "epoch": 2.8853835389899505, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9432263374328613, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8647347688674927, + "num_tokens": 865466531.0, + "step": 22682 + }, + { + "epoch": 2.885510749268541, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9517019987106323, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8730140924453735, + "num_tokens": 865503177.0, + "step": 22683 + }, + { + "epoch": 2.8856379595471315, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.037916421890259, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8586856722831726, + "num_tokens": 865539454.0, + "step": 22684 + }, + { + "epoch": 2.885765169825722, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.061020612716675, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8842579126358032, + "num_tokens": 865571937.0, + "step": 22685 + }, + { + "epoch": 2.8858923801043126, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8520750999450684, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8686344623565674, + "num_tokens": 865609457.0, + "step": 22686 + }, + { + "epoch": 2.8860195903829027, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0488293170928955, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8530114889144897, + "num_tokens": 865647391.0, + "step": 22687 + }, + { + "epoch": 2.8861468006614936, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9832043647766113, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8652327060699463, + "num_tokens": 865680578.0, + "step": 22688 + }, + { + "epoch": 2.8862740109400837, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.993638277053833, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8629103899002075, + "num_tokens": 865714610.0, + "step": 22689 + }, + { + "epoch": 2.8864012212186747, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8954983949661255, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8615312576293945, + "num_tokens": 865755672.0, + "step": 22690 + }, + { + "epoch": 2.886528431497265, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.025198221206665, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8721591234207153, + "num_tokens": 865790111.0, + "step": 22691 + }, + { + "epoch": 2.8866556417758558, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.033501386642456, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8647099137306213, + "num_tokens": 865825580.0, + "step": 22692 + }, + { + "epoch": 2.886782852054446, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8095091581344604, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8697280883789062, + "num_tokens": 865869099.0, + "step": 22693 + }, + { + "epoch": 2.8869100623330364, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.045501947402954, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8560059666633606, + "num_tokens": 865906957.0, + "step": 22694 + }, + { + "epoch": 2.887037272611627, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9264498949050903, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8760255575180054, + "num_tokens": 865945712.0, + "step": 22695 + }, + { + "epoch": 2.8871644828902174, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8676000833511353, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8520009517669678, + "num_tokens": 865984469.0, + "step": 22696 + }, + { + "epoch": 2.887291693168808, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8454244136810303, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8671292066574097, + "num_tokens": 866024624.0, + "step": 22697 + }, + { + "epoch": 2.8874189034473985, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.850197434425354, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8628252744674683, + "num_tokens": 866062446.0, + "step": 22698 + }, + { + "epoch": 2.887546113725989, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.828672170639038, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8756265044212341, + "num_tokens": 866100896.0, + "step": 22699 + }, + { + "epoch": 2.8876733240045795, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0149474143981934, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8516237139701843, + "num_tokens": 866134668.0, + "step": 22700 + }, + { + "epoch": 2.88780053428317, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9808235168457031, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8590654730796814, + "num_tokens": 866173834.0, + "step": 22701 + }, + { + "epoch": 2.8879277445617606, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0439774990081787, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8476498126983643, + "num_tokens": 866211691.0, + "step": 22702 + }, + { + "epoch": 2.888054954840351, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0566928386688232, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8639559149742126, + "num_tokens": 866240878.0, + "step": 22703 + }, + { + "epoch": 2.8881821651189417, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7329421043395996, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8519272804260254, + "num_tokens": 866289552.0, + "step": 22704 + }, + { + "epoch": 2.888309375397532, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8562957048416138, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8560810685157776, + "num_tokens": 866330361.0, + "step": 22705 + }, + { + "epoch": 2.8884365856761227, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2558553218841553, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8681010007858276, + "num_tokens": 866371557.0, + "step": 22706 + }, + { + "epoch": 2.8885637959547132, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9841622114181519, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8663437366485596, + "num_tokens": 866410627.0, + "step": 22707 + }, + { + "epoch": 2.8886910062333038, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0391454696655273, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8627603054046631, + "num_tokens": 866446036.0, + "step": 22708 + }, + { + "epoch": 2.8888182165118943, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7942228317260742, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8558523654937744, + "num_tokens": 866489655.0, + "step": 22709 + }, + { + "epoch": 2.888945426790485, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7284891605377197, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8579106330871582, + "num_tokens": 866532750.0, + "step": 22710 + }, + { + "epoch": 2.8890726370690754, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8142828941345215, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.87190181016922, + "num_tokens": 866570748.0, + "step": 22711 + }, + { + "epoch": 2.8891998473476654, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.134718894958496, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.861060619354248, + "num_tokens": 866609422.0, + "step": 22712 + }, + { + "epoch": 2.8893270576262564, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9061882495880127, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8655235767364502, + "num_tokens": 866650865.0, + "step": 22713 + }, + { + "epoch": 2.8894542679048465, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8292033672332764, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8635736703872681, + "num_tokens": 866687605.0, + "step": 22714 + }, + { + "epoch": 2.8895814781834375, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9472099542617798, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8553791046142578, + "num_tokens": 866722974.0, + "step": 22715 + }, + { + "epoch": 2.8897086884620276, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9818265438079834, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8639332056045532, + "num_tokens": 866760694.0, + "step": 22716 + }, + { + "epoch": 2.889835898740618, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0920112133026123, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8474882245063782, + "num_tokens": 866795102.0, + "step": 22717 + }, + { + "epoch": 2.8899631090192086, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9387842416763306, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8794664144515991, + "num_tokens": 866831282.0, + "step": 22718 + }, + { + "epoch": 2.890090319297799, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.991426706314087, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8580096364021301, + "num_tokens": 866872661.0, + "step": 22719 + }, + { + "epoch": 2.8902175295763897, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9571870565414429, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8560616970062256, + "num_tokens": 866917576.0, + "step": 22720 + }, + { + "epoch": 2.89034473985498, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0076279640197754, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8690906167030334, + "num_tokens": 866957245.0, + "step": 22721 + }, + { + "epoch": 2.8904719501335707, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9782923460006714, + "learning_rate": 1e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8362089395523071, + "num_tokens": 866998695.0, + "step": 22722 + }, + { + "epoch": 2.8905991604121613, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.285552501678467, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8507753610610962, + "num_tokens": 867033502.0, + "step": 22723 + }, + { + "epoch": 2.890726370690752, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0531911849975586, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8473668694496155, + "num_tokens": 867076032.0, + "step": 22724 + }, + { + "epoch": 2.8908535809693423, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1688971519470215, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8651084303855896, + "num_tokens": 867107429.0, + "step": 22725 + }, + { + "epoch": 2.890980791247933, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.020975112915039, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8687881231307983, + "num_tokens": 867139432.0, + "step": 22726 + }, + { + "epoch": 2.8911080015265234, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9103156328201294, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8611146211624146, + "num_tokens": 867180588.0, + "step": 22727 + }, + { + "epoch": 2.891235211805114, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0647988319396973, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8607876896858215, + "num_tokens": 867215170.0, + "step": 22728 + }, + { + "epoch": 2.8913624220837044, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7735577821731567, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8808951377868652, + "num_tokens": 867254958.0, + "step": 22729 + }, + { + "epoch": 2.891489632362295, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9751380681991577, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8631277084350586, + "num_tokens": 867291135.0, + "step": 22730 + }, + { + "epoch": 2.8916168426408855, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8468626737594604, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.873785674571991, + "num_tokens": 867326926.0, + "step": 22731 + }, + { + "epoch": 2.891744052919476, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0096888542175293, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8555275797843933, + "num_tokens": 867361142.0, + "step": 22732 + }, + { + "epoch": 2.8918712631980665, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9568125009536743, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8826037645339966, + "num_tokens": 867398344.0, + "step": 22733 + }, + { + "epoch": 2.891998473476657, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.772218942642212, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8729294538497925, + "num_tokens": 867437642.0, + "step": 22734 + }, + { + "epoch": 2.8921256837552476, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9183547496795654, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8544007539749146, + "num_tokens": 867471646.0, + "step": 22735 + }, + { + "epoch": 2.892252894033838, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.924346685409546, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8667421936988831, + "num_tokens": 867509682.0, + "step": 22736 + }, + { + "epoch": 2.892380104312428, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.043585777282715, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8607491850852966, + "num_tokens": 867544425.0, + "step": 22737 + }, + { + "epoch": 2.892507314591019, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.917400598526001, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8576096892356873, + "num_tokens": 867584978.0, + "step": 22738 + }, + { + "epoch": 2.8926345248696093, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0684423446655273, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8661672472953796, + "num_tokens": 867620360.0, + "step": 22739 + }, + { + "epoch": 2.8927617351482002, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.018120765686035, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8734627962112427, + "num_tokens": 867655246.0, + "step": 22740 + }, + { + "epoch": 2.8928889454267903, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9695907831192017, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8553336262702942, + "num_tokens": 867692792.0, + "step": 22741 + }, + { + "epoch": 2.893016155705381, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8774826526641846, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8678709268569946, + "num_tokens": 867732365.0, + "step": 22742 + }, + { + "epoch": 2.8931433659839714, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.917250394821167, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8517448902130127, + "num_tokens": 867764800.0, + "step": 22743 + }, + { + "epoch": 2.893270576262562, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.837731957435608, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8577128648757935, + "num_tokens": 867807107.0, + "step": 22744 + }, + { + "epoch": 2.8933977865411524, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8835421800613403, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8542742729187012, + "num_tokens": 867850225.0, + "step": 22745 + }, + { + "epoch": 2.893524996819743, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7758740186691284, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8605477213859558, + "num_tokens": 867890457.0, + "step": 22746 + }, + { + "epoch": 2.8936522070983335, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.877672791481018, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8702086806297302, + "num_tokens": 867925880.0, + "step": 22747 + }, + { + "epoch": 2.893779417376924, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7316062450408936, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.879428505897522, + "num_tokens": 867963923.0, + "step": 22748 + }, + { + "epoch": 2.8939066276555145, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8369174003601074, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8679926991462708, + "num_tokens": 868000489.0, + "step": 22749 + }, + { + "epoch": 2.894033837934105, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.083218812942505, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.86745285987854, + "num_tokens": 868034001.0, + "step": 22750 + }, + { + "epoch": 2.8941610482126956, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8891315460205078, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.868664026260376, + "num_tokens": 868074956.0, + "step": 22751 + }, + { + "epoch": 2.894288258491286, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8978437185287476, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8738532066345215, + "num_tokens": 868110727.0, + "step": 22752 + }, + { + "epoch": 2.8944154687698767, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8488404750823975, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8452166318893433, + "num_tokens": 868151480.0, + "step": 22753 + }, + { + "epoch": 2.894542679048467, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9546657800674438, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8562394380569458, + "num_tokens": 868190620.0, + "step": 22754 + }, + { + "epoch": 2.8946698893270577, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.869276523590088, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8633731603622437, + "num_tokens": 868230598.0, + "step": 22755 + }, + { + "epoch": 2.8947970996056482, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9742584228515625, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8572300672531128, + "num_tokens": 868265933.0, + "step": 22756 + }, + { + "epoch": 2.8949243098842388, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8558967113494873, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.867671012878418, + "num_tokens": 868300282.0, + "step": 22757 + }, + { + "epoch": 2.8950515201628293, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.003248691558838, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8808709979057312, + "num_tokens": 868331031.0, + "step": 22758 + }, + { + "epoch": 2.89517873044142, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.113407611846924, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8660600185394287, + "num_tokens": 868368268.0, + "step": 22759 + }, + { + "epoch": 2.89530594072001, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0553250312805176, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8486950993537903, + "num_tokens": 868407551.0, + "step": 22760 + }, + { + "epoch": 2.895433150998601, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.842734456062317, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8622286319732666, + "num_tokens": 868449301.0, + "step": 22761 + }, + { + "epoch": 2.895560361277191, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9502671957015991, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8621649742126465, + "num_tokens": 868482853.0, + "step": 22762 + }, + { + "epoch": 2.895687571555782, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9607539176940918, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8596391677856445, + "num_tokens": 868523960.0, + "step": 22763 + }, + { + "epoch": 2.895814781834372, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.111583948135376, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.857714831829071, + "num_tokens": 868566447.0, + "step": 22764 + }, + { + "epoch": 2.895941992112963, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7983734607696533, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8749961853027344, + "num_tokens": 868608342.0, + "step": 22765 + }, + { + "epoch": 2.896069202391553, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8383252620697021, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8761311769485474, + "num_tokens": 868647354.0, + "step": 22766 + }, + { + "epoch": 2.8961964126701436, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9939892292022705, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8471938371658325, + "num_tokens": 868690716.0, + "step": 22767 + }, + { + "epoch": 2.896323622948734, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9013314247131348, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8721709847450256, + "num_tokens": 868728318.0, + "step": 22768 + }, + { + "epoch": 2.8964508332273247, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8912018537521362, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8796850442886353, + "num_tokens": 868764323.0, + "step": 22769 + }, + { + "epoch": 2.896578043505915, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8750507831573486, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8475416898727417, + "num_tokens": 868801371.0, + "step": 22770 + }, + { + "epoch": 2.8967052537845057, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7376998662948608, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8703474402427673, + "num_tokens": 868846679.0, + "step": 22771 + }, + { + "epoch": 2.8968324640630962, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.143793821334839, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8506966829299927, + "num_tokens": 868880076.0, + "step": 22772 + }, + { + "epoch": 2.8969596743416868, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0875561237335205, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8653379678726196, + "num_tokens": 868917086.0, + "step": 22773 + }, + { + "epoch": 2.8970868846202773, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9613232612609863, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8703035116195679, + "num_tokens": 868954952.0, + "step": 22774 + }, + { + "epoch": 2.897214094898868, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0709047317504883, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8597951531410217, + "num_tokens": 868990911.0, + "step": 22775 + }, + { + "epoch": 2.8973413051774584, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.6944323778152466, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8756825923919678, + "num_tokens": 869033398.0, + "step": 22776 + }, + { + "epoch": 2.897468515456049, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.612804889678955, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8519032001495361, + "num_tokens": 869070513.0, + "step": 22777 + }, + { + "epoch": 2.8975957257346394, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7935103178024292, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8726949691772461, + "num_tokens": 869113494.0, + "step": 22778 + }, + { + "epoch": 2.89772293601323, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9121012687683105, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8754531145095825, + "num_tokens": 869147433.0, + "step": 22779 + }, + { + "epoch": 2.8978501462918205, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.977331519126892, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8622573614120483, + "num_tokens": 869186763.0, + "step": 22780 + }, + { + "epoch": 2.897977356570411, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0634379386901855, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8775190114974976, + "num_tokens": 869222624.0, + "step": 22781 + }, + { + "epoch": 2.8981045668490015, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8232773542404175, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8868468999862671, + "num_tokens": 869263027.0, + "step": 22782 + }, + { + "epoch": 2.898231777127592, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9060986042022705, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8607090711593628, + "num_tokens": 869304232.0, + "step": 22783 + }, + { + "epoch": 2.8983589874061826, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.86381196975708, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8715521097183228, + "num_tokens": 869340141.0, + "step": 22784 + }, + { + "epoch": 2.8984861976847727, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8493014574050903, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8576677441596985, + "num_tokens": 869382428.0, + "step": 22785 + }, + { + "epoch": 2.8986134079633636, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9058647155761719, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8728376626968384, + "num_tokens": 869418890.0, + "step": 22786 + }, + { + "epoch": 2.8987406182419537, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8284658193588257, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8657358288764954, + "num_tokens": 869453704.0, + "step": 22787 + }, + { + "epoch": 2.8988678285205447, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8584866523742676, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8607427477836609, + "num_tokens": 869491845.0, + "step": 22788 + }, + { + "epoch": 2.898995038799135, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1217939853668213, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8759766221046448, + "num_tokens": 869523393.0, + "step": 22789 + }, + { + "epoch": 2.8991222490777258, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8652541637420654, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.863068699836731, + "num_tokens": 869561927.0, + "step": 22790 + }, + { + "epoch": 2.899249459356316, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8489118814468384, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8645884990692139, + "num_tokens": 869599568.0, + "step": 22791 + }, + { + "epoch": 2.8993766696349064, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.795806646347046, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8685672283172607, + "num_tokens": 869636329.0, + "step": 22792 + }, + { + "epoch": 2.899503879913497, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7542346715927124, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8594716191291809, + "num_tokens": 869679720.0, + "step": 22793 + }, + { + "epoch": 2.8996310901920874, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8459540605545044, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8677747249603271, + "num_tokens": 869719527.0, + "step": 22794 + }, + { + "epoch": 2.899758300470678, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9755167961120605, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8706157207489014, + "num_tokens": 869755139.0, + "step": 22795 + }, + { + "epoch": 2.8998855107492685, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0684783458709717, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8590754866600037, + "num_tokens": 869791860.0, + "step": 22796 + }, + { + "epoch": 2.900012721027859, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.912907600402832, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8870127201080322, + "num_tokens": 869827294.0, + "step": 22797 + }, + { + "epoch": 2.9001399313064495, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0766310691833496, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8601635098457336, + "num_tokens": 869859398.0, + "step": 22798 + }, + { + "epoch": 2.90026714158504, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8147097826004028, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8644722700119019, + "num_tokens": 869898215.0, + "step": 22799 + }, + { + "epoch": 2.9003943518636306, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0411317348480225, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8533328175544739, + "num_tokens": 869939787.0, + "step": 22800 + }, + { + "epoch": 2.900521562142221, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.062390089035034, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.85663902759552, + "num_tokens": 869976790.0, + "step": 22801 + }, + { + "epoch": 2.9006487724208116, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.742337703704834, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8696109652519226, + "num_tokens": 870013547.0, + "step": 22802 + }, + { + "epoch": 2.900775982699402, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7387722730636597, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8777234554290771, + "num_tokens": 870049540.0, + "step": 22803 + }, + { + "epoch": 2.9009031929779927, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0854976177215576, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8737463355064392, + "num_tokens": 870088247.0, + "step": 22804 + }, + { + "epoch": 2.9010304032565832, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9154690504074097, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8651858568191528, + "num_tokens": 870129112.0, + "step": 22805 + }, + { + "epoch": 2.9011576135351738, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1998770236968994, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8660428524017334, + "num_tokens": 870162082.0, + "step": 22806 + }, + { + "epoch": 2.9012848238137643, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.1605660915374756, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8761458396911621, + "num_tokens": 870199868.0, + "step": 22807 + }, + { + "epoch": 2.901412034092355, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0160276889801025, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8710941076278687, + "num_tokens": 870238121.0, + "step": 22808 + }, + { + "epoch": 2.9015392443709453, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.880767822265625, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8560642004013062, + "num_tokens": 870278008.0, + "step": 22809 + }, + { + "epoch": 2.9016664546495354, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8373208045959473, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8617771863937378, + "num_tokens": 870317671.0, + "step": 22810 + }, + { + "epoch": 2.9017936649281264, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.105839729309082, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8709901571273804, + "num_tokens": 870352893.0, + "step": 22811 + }, + { + "epoch": 2.9019208752067165, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9853843450546265, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8725948333740234, + "num_tokens": 870387513.0, + "step": 22812 + }, + { + "epoch": 2.9020480854853075, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7825920581817627, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8721545934677124, + "num_tokens": 870428649.0, + "step": 22813 + }, + { + "epoch": 2.9021752957638975, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8350121974945068, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8809888958930969, + "num_tokens": 870468329.0, + "step": 22814 + }, + { + "epoch": 2.902302506042488, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.254056930541992, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8658161163330078, + "num_tokens": 870509273.0, + "step": 22815 + }, + { + "epoch": 2.9024297163210786, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.203174352645874, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8593615293502808, + "num_tokens": 870544206.0, + "step": 22816 + }, + { + "epoch": 2.902556926599669, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9600956439971924, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8813037872314453, + "num_tokens": 870578708.0, + "step": 22817 + }, + { + "epoch": 2.9026841368782597, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7803854942321777, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8724544048309326, + "num_tokens": 870620599.0, + "step": 22818 + }, + { + "epoch": 2.90281134715685, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9144725799560547, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8788816928863525, + "num_tokens": 870657828.0, + "step": 22819 + }, + { + "epoch": 2.9029385574354407, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.03291654586792, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8606374263763428, + "num_tokens": 870694786.0, + "step": 22820 + }, + { + "epoch": 2.9030657677140312, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7727923393249512, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8837946653366089, + "num_tokens": 870733850.0, + "step": 22821 + }, + { + "epoch": 2.9031929779926218, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8740047216415405, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8674166202545166, + "num_tokens": 870771556.0, + "step": 22822 + }, + { + "epoch": 2.9033201882712123, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.191263437271118, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8757089376449585, + "num_tokens": 870809335.0, + "step": 22823 + }, + { + "epoch": 2.903447398549803, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0659890174865723, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.861361026763916, + "num_tokens": 870844239.0, + "step": 22824 + }, + { + "epoch": 2.9035746088283934, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9347865581512451, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8663804531097412, + "num_tokens": 870879540.0, + "step": 22825 + }, + { + "epoch": 2.903701819106984, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.050030469894409, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8647040128707886, + "num_tokens": 870917219.0, + "step": 22826 + }, + { + "epoch": 2.9038290293855744, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.3597171306610107, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8674055337905884, + "num_tokens": 870952332.0, + "step": 22827 + }, + { + "epoch": 2.903956239664165, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.059067964553833, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8524581789970398, + "num_tokens": 870991609.0, + "step": 22828 + }, + { + "epoch": 2.9040834499427555, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1116020679473877, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8695819973945618, + "num_tokens": 871027456.0, + "step": 22829 + }, + { + "epoch": 2.904210660221346, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.822143316268921, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8574937582015991, + "num_tokens": 871071833.0, + "step": 22830 + }, + { + "epoch": 2.9043378704999365, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8811124563217163, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8648267388343811, + "num_tokens": 871105446.0, + "step": 22831 + }, + { + "epoch": 2.904465080778527, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9359915256500244, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8641979694366455, + "num_tokens": 871141408.0, + "step": 22832 + }, + { + "epoch": 2.9045922910571176, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0952823162078857, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8678132891654968, + "num_tokens": 871173296.0, + "step": 22833 + }, + { + "epoch": 2.904719501335708, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.874932885169983, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8620190620422363, + "num_tokens": 871210711.0, + "step": 22834 + }, + { + "epoch": 2.904846711614298, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.033440113067627, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8621729612350464, + "num_tokens": 871251179.0, + "step": 22835 + }, + { + "epoch": 2.904973921892889, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7776110172271729, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.884886622428894, + "num_tokens": 871295758.0, + "step": 22836 + }, + { + "epoch": 2.9051011321714793, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2272772789001465, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8490949869155884, + "num_tokens": 871331145.0, + "step": 22837 + }, + { + "epoch": 2.9052283424500702, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2679660320281982, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.839868426322937, + "num_tokens": 871372819.0, + "step": 22838 + }, + { + "epoch": 2.9053555527286603, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7696820497512817, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8599222898483276, + "num_tokens": 871414223.0, + "step": 22839 + }, + { + "epoch": 2.905482763007251, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.793066382408142, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8698288798332214, + "num_tokens": 871455013.0, + "step": 22840 + }, + { + "epoch": 2.9056099732858414, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8605518341064453, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.864188551902771, + "num_tokens": 871495462.0, + "step": 22841 + }, + { + "epoch": 2.905737183564432, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.826393723487854, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8673648834228516, + "num_tokens": 871534999.0, + "step": 22842 + }, + { + "epoch": 2.9058643938430224, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0017528533935547, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8665951490402222, + "num_tokens": 871569816.0, + "step": 22843 + }, + { + "epoch": 2.905991604121613, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0683839321136475, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.86879962682724, + "num_tokens": 871610441.0, + "step": 22844 + }, + { + "epoch": 2.9061188144002035, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0067012310028076, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8520153164863586, + "num_tokens": 871649005.0, + "step": 22845 + }, + { + "epoch": 2.906246024678794, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.924411416053772, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8608354330062866, + "num_tokens": 871689263.0, + "step": 22846 + }, + { + "epoch": 2.9063732349573845, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9436407089233398, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8747882843017578, + "num_tokens": 871725440.0, + "step": 22847 + }, + { + "epoch": 2.906500445235975, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8507187366485596, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8626658320426941, + "num_tokens": 871767839.0, + "step": 22848 + }, + { + "epoch": 2.9066276555145656, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0100314617156982, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8694436550140381, + "num_tokens": 871801516.0, + "step": 22849 + }, + { + "epoch": 2.906754865793156, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8227040767669678, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8615689277648926, + "num_tokens": 871840359.0, + "step": 22850 + }, + { + "epoch": 2.9068820760717466, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.883590579032898, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8696373701095581, + "num_tokens": 871879396.0, + "step": 22851 + }, + { + "epoch": 2.907009286350337, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0617358684539795, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8595702052116394, + "num_tokens": 871915043.0, + "step": 22852 + }, + { + "epoch": 2.9071364966289277, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8871885538101196, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8603342771530151, + "num_tokens": 871953391.0, + "step": 22853 + }, + { + "epoch": 2.9072637069075182, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8754563331604004, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.869049608707428, + "num_tokens": 871989879.0, + "step": 22854 + }, + { + "epoch": 2.9073909171861088, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.261807441711426, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8681882619857788, + "num_tokens": 872022428.0, + "step": 22855 + }, + { + "epoch": 2.9075181274646993, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9537618160247803, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8713584542274475, + "num_tokens": 872060644.0, + "step": 22856 + }, + { + "epoch": 2.90764533774329, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9560275077819824, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8627849817276001, + "num_tokens": 872098230.0, + "step": 22857 + }, + { + "epoch": 2.90777254802188, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9155257940292358, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8539518713951111, + "num_tokens": 872131217.0, + "step": 22858 + }, + { + "epoch": 2.907899758300471, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9564054012298584, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.873780369758606, + "num_tokens": 872163912.0, + "step": 22859 + }, + { + "epoch": 2.908026968579061, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.047851085662842, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8415070176124573, + "num_tokens": 872200694.0, + "step": 22860 + }, + { + "epoch": 2.908154178857652, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8543416261672974, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8659044504165649, + "num_tokens": 872241156.0, + "step": 22861 + }, + { + "epoch": 2.908281389136242, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.003359794616699, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.867331862449646, + "num_tokens": 872273048.0, + "step": 22862 + }, + { + "epoch": 2.908408599414833, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8264726400375366, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8696045875549316, + "num_tokens": 872312165.0, + "step": 22863 + }, + { + "epoch": 2.908535809693423, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8859535455703735, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8683929443359375, + "num_tokens": 872352740.0, + "step": 22864 + }, + { + "epoch": 2.9086630199720136, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8371089696884155, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8523938655853271, + "num_tokens": 872397800.0, + "step": 22865 + }, + { + "epoch": 2.908790230250604, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.026430130004883, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.877996563911438, + "num_tokens": 872429307.0, + "step": 22866 + }, + { + "epoch": 2.9089174405291947, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0482215881347656, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8620023727416992, + "num_tokens": 872465897.0, + "step": 22867 + }, + { + "epoch": 2.909044650807785, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9068925380706787, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.855583667755127, + "num_tokens": 872503252.0, + "step": 22868 + }, + { + "epoch": 2.9091718610863757, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9442265033721924, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8677191138267517, + "num_tokens": 872546276.0, + "step": 22869 + }, + { + "epoch": 2.9092990713649662, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7856552600860596, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8634098172187805, + "num_tokens": 872585690.0, + "step": 22870 + }, + { + "epoch": 2.9094262816435568, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.001811981201172, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8554191589355469, + "num_tokens": 872621718.0, + "step": 22871 + }, + { + "epoch": 2.9095534919221473, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.008668899536133, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8692532777786255, + "num_tokens": 872659349.0, + "step": 22872 + }, + { + "epoch": 2.909680702200738, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7410011291503906, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8623118996620178, + "num_tokens": 872704468.0, + "step": 22873 + }, + { + "epoch": 2.9098079124793284, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.043910264968872, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8656567335128784, + "num_tokens": 872740003.0, + "step": 22874 + }, + { + "epoch": 2.909935122757919, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.250129461288452, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8444634675979614, + "num_tokens": 872780362.0, + "step": 22875 + }, + { + "epoch": 2.9100623330365094, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7248586416244507, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8731975555419922, + "num_tokens": 872823719.0, + "step": 22876 + }, + { + "epoch": 2.9101895433151, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2029616832733154, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8680910468101501, + "num_tokens": 872856276.0, + "step": 22877 + }, + { + "epoch": 2.9103167535936905, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8321410417556763, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.858548104763031, + "num_tokens": 872893688.0, + "step": 22878 + }, + { + "epoch": 2.910443963872281, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8938214778900146, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8841648101806641, + "num_tokens": 872931169.0, + "step": 22879 + }, + { + "epoch": 2.9105711741508715, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7665148973464966, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8602685332298279, + "num_tokens": 872970836.0, + "step": 22880 + }, + { + "epoch": 2.910698384429462, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7557610273361206, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8796999454498291, + "num_tokens": 873010879.0, + "step": 22881 + }, + { + "epoch": 2.9108255947080526, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7785042524337769, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8674716949462891, + "num_tokens": 873058018.0, + "step": 22882 + }, + { + "epoch": 2.9109528049866427, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9188859462738037, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8562929630279541, + "num_tokens": 873096259.0, + "step": 22883 + }, + { + "epoch": 2.9110800152652336, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.117882013320923, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8556617498397827, + "num_tokens": 873131955.0, + "step": 22884 + }, + { + "epoch": 2.9112072255438237, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.960573434829712, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.860579252243042, + "num_tokens": 873171024.0, + "step": 22885 + }, + { + "epoch": 2.9113344358224147, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.025697708129883, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.870227038860321, + "num_tokens": 873210568.0, + "step": 22886 + }, + { + "epoch": 2.9114616461010048, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8143208026885986, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8681720495223999, + "num_tokens": 873250025.0, + "step": 22887 + }, + { + "epoch": 2.9115888563795957, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9411847591400146, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8581147789955139, + "num_tokens": 873285284.0, + "step": 22888 + }, + { + "epoch": 2.911716066658186, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.01031756401062, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.868255615234375, + "num_tokens": 873320102.0, + "step": 22889 + }, + { + "epoch": 2.9118432769367764, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.158784866333008, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8695480227470398, + "num_tokens": 873354668.0, + "step": 22890 + }, + { + "epoch": 2.911970487215367, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0767266750335693, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8614968657493591, + "num_tokens": 873387415.0, + "step": 22891 + }, + { + "epoch": 2.9120976974939574, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7437487840652466, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8843634128570557, + "num_tokens": 873428679.0, + "step": 22892 + }, + { + "epoch": 2.912224907772548, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.89421546459198, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8651552200317383, + "num_tokens": 873466973.0, + "step": 22893 + }, + { + "epoch": 2.9123521180511385, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2780840396881104, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8796436190605164, + "num_tokens": 873510957.0, + "step": 22894 + }, + { + "epoch": 2.912479328329729, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9723979234695435, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8445863723754883, + "num_tokens": 873546456.0, + "step": 22895 + }, + { + "epoch": 2.9126065386083195, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7986618280410767, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8508192300796509, + "num_tokens": 873591972.0, + "step": 22896 + }, + { + "epoch": 2.91273374888691, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9223960638046265, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8636531829833984, + "num_tokens": 873629790.0, + "step": 22897 + }, + { + "epoch": 2.9128609591655006, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8403592109680176, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.865863025188446, + "num_tokens": 873667419.0, + "step": 22898 + }, + { + "epoch": 2.912988169444091, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8477030992507935, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8753299713134766, + "num_tokens": 873705834.0, + "step": 22899 + }, + { + "epoch": 2.9131153797226816, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.147618055343628, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8577169179916382, + "num_tokens": 873748859.0, + "step": 22900 + }, + { + "epoch": 2.913242590001272, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8889590501785278, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.856948733329773, + "num_tokens": 873788068.0, + "step": 22901 + }, + { + "epoch": 2.9133698002798627, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8803647756576538, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8729840517044067, + "num_tokens": 873827820.0, + "step": 22902 + }, + { + "epoch": 2.9134970105584532, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8884936571121216, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8627563714981079, + "num_tokens": 873866271.0, + "step": 22903 + }, + { + "epoch": 2.9136242208370438, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8209874629974365, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8620448112487793, + "num_tokens": 873907409.0, + "step": 22904 + }, + { + "epoch": 2.9137514311156343, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.831647515296936, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.873420774936676, + "num_tokens": 873947940.0, + "step": 22905 + }, + { + "epoch": 2.913878641394225, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7568613290786743, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8514474034309387, + "num_tokens": 873990062.0, + "step": 22906 + }, + { + "epoch": 2.9140058516728153, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7756580114364624, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8769007921218872, + "num_tokens": 874026660.0, + "step": 22907 + }, + { + "epoch": 2.9141330619514054, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.860058307647705, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8721344470977783, + "num_tokens": 874064935.0, + "step": 22908 + }, + { + "epoch": 2.9142602722299964, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.3962652683258057, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8653647899627686, + "num_tokens": 874102807.0, + "step": 22909 + }, + { + "epoch": 2.9143874825085865, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8837711811065674, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8682990074157715, + "num_tokens": 874139688.0, + "step": 22910 + }, + { + "epoch": 2.9145146927871775, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8152116537094116, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8716551065444946, + "num_tokens": 874184505.0, + "step": 22911 + }, + { + "epoch": 2.9146419030657675, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8685615062713623, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8655823469161987, + "num_tokens": 874223210.0, + "step": 22912 + }, + { + "epoch": 2.914769113344358, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.008624792098999, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8536556363105774, + "num_tokens": 874263939.0, + "step": 22913 + }, + { + "epoch": 2.9148963236229486, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8559621572494507, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8737959861755371, + "num_tokens": 874303946.0, + "step": 22914 + }, + { + "epoch": 2.915023533901539, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9697697162628174, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8677217960357666, + "num_tokens": 874343701.0, + "step": 22915 + }, + { + "epoch": 2.9151507441801296, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9762595891952515, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8678542971611023, + "num_tokens": 874380425.0, + "step": 22916 + }, + { + "epoch": 2.91527795445872, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9646855592727661, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8674052953720093, + "num_tokens": 874417643.0, + "step": 22917 + }, + { + "epoch": 2.9154051647373107, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0579614639282227, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.858036994934082, + "num_tokens": 874463939.0, + "step": 22918 + }, + { + "epoch": 2.9155323750159012, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7996360063552856, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8614035844802856, + "num_tokens": 874503156.0, + "step": 22919 + }, + { + "epoch": 2.9156595852944918, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8251709938049316, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8534440994262695, + "num_tokens": 874546754.0, + "step": 22920 + }, + { + "epoch": 2.9157867955730823, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.628672480583191, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8687976598739624, + "num_tokens": 874590781.0, + "step": 22921 + }, + { + "epoch": 2.915914005851673, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8873502016067505, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.875785231590271, + "num_tokens": 874629592.0, + "step": 22922 + }, + { + "epoch": 2.9160412161302633, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9318605661392212, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.871886134147644, + "num_tokens": 874667372.0, + "step": 22923 + }, + { + "epoch": 2.916168426408854, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.814044713973999, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8620635867118835, + "num_tokens": 874704687.0, + "step": 22924 + }, + { + "epoch": 2.9162956366874444, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9723150730133057, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8644352555274963, + "num_tokens": 874740471.0, + "step": 22925 + }, + { + "epoch": 2.916422846966035, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0117740631103516, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8713663220405579, + "num_tokens": 874780747.0, + "step": 22926 + }, + { + "epoch": 2.9165500572446255, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9331871271133423, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8702238202095032, + "num_tokens": 874814204.0, + "step": 22927 + }, + { + "epoch": 2.916677267523216, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0453007221221924, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8650377988815308, + "num_tokens": 874851872.0, + "step": 22928 + }, + { + "epoch": 2.9168044778018065, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0793910026550293, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8506323099136353, + "num_tokens": 874884215.0, + "step": 22929 + }, + { + "epoch": 2.916931688080397, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0033159255981445, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8632584810256958, + "num_tokens": 874919047.0, + "step": 22930 + }, + { + "epoch": 2.917058898358987, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1543259620666504, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8582998514175415, + "num_tokens": 874956469.0, + "step": 22931 + }, + { + "epoch": 2.917186108637578, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9456459283828735, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8761755228042603, + "num_tokens": 874994633.0, + "step": 22932 + }, + { + "epoch": 2.917313318916168, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0894434452056885, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8793433904647827, + "num_tokens": 875028530.0, + "step": 22933 + }, + { + "epoch": 2.917440529194759, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.114603281021118, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8575122356414795, + "num_tokens": 875059785.0, + "step": 22934 + }, + { + "epoch": 2.9175677394733492, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.059068202972412, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8630408048629761, + "num_tokens": 875096057.0, + "step": 22935 + }, + { + "epoch": 2.91769494975194, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7255253791809082, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8713240623474121, + "num_tokens": 875141988.0, + "step": 22936 + }, + { + "epoch": 2.9178221600305303, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8146824836730957, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8703458309173584, + "num_tokens": 875176973.0, + "step": 22937 + }, + { + "epoch": 2.917949370309121, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9301562309265137, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8752834796905518, + "num_tokens": 875209271.0, + "step": 22938 + }, + { + "epoch": 2.9180765805877114, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7971175909042358, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8707260489463806, + "num_tokens": 875247460.0, + "step": 22939 + }, + { + "epoch": 2.918203790866302, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8146288394927979, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8644427061080933, + "num_tokens": 875286013.0, + "step": 22940 + }, + { + "epoch": 2.9183310011448924, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8243381977081299, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8609596490859985, + "num_tokens": 875324547.0, + "step": 22941 + }, + { + "epoch": 2.918458211423483, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.69973886013031, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8573379516601562, + "num_tokens": 875366655.0, + "step": 22942 + }, + { + "epoch": 2.9185854217020735, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7974698543548584, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8599668741226196, + "num_tokens": 875408571.0, + "step": 22943 + }, + { + "epoch": 2.918712631980664, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7960437536239624, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8594963550567627, + "num_tokens": 875450730.0, + "step": 22944 + }, + { + "epoch": 2.9188398422592545, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.938419222831726, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8684964179992676, + "num_tokens": 875486850.0, + "step": 22945 + }, + { + "epoch": 2.918967052537845, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8454729318618774, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8585259914398193, + "num_tokens": 875528653.0, + "step": 22946 + }, + { + "epoch": 2.9190942628164356, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8973805904388428, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8757915496826172, + "num_tokens": 875564155.0, + "step": 22947 + }, + { + "epoch": 2.919221473095026, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9169079065322876, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8650740385055542, + "num_tokens": 875598548.0, + "step": 22948 + }, + { + "epoch": 2.9193486833736166, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8390653133392334, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8814620971679688, + "num_tokens": 875634587.0, + "step": 22949 + }, + { + "epoch": 2.919475893652207, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.011265516281128, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8642660975456238, + "num_tokens": 875671956.0, + "step": 22950 + }, + { + "epoch": 2.9196031039307977, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9972933530807495, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8687065839767456, + "num_tokens": 875706327.0, + "step": 22951 + }, + { + "epoch": 2.9197303142093882, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9119905233383179, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8721628189086914, + "num_tokens": 875749320.0, + "step": 22952 + }, + { + "epoch": 2.9198575244879788, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1617283821105957, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8756756782531738, + "num_tokens": 875783871.0, + "step": 22953 + }, + { + "epoch": 2.9199847347665693, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9004158973693848, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8659202456474304, + "num_tokens": 875821716.0, + "step": 22954 + }, + { + "epoch": 2.92011194504516, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9552730321884155, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8691960573196411, + "num_tokens": 875854339.0, + "step": 22955 + }, + { + "epoch": 2.92023915532375, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8804528713226318, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8655858635902405, + "num_tokens": 875889456.0, + "step": 22956 + }, + { + "epoch": 2.920366365602341, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8705686330795288, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8568947315216064, + "num_tokens": 875926659.0, + "step": 22957 + }, + { + "epoch": 2.920493575880931, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.756049633026123, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8739213943481445, + "num_tokens": 875972766.0, + "step": 22958 + }, + { + "epoch": 2.920620786159522, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0209708213806152, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8609720468521118, + "num_tokens": 876011824.0, + "step": 22959 + }, + { + "epoch": 2.920747996438112, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.062316656112671, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8475562334060669, + "num_tokens": 876050353.0, + "step": 22960 + }, + { + "epoch": 2.920875206716703, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9259223937988281, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.851265013217926, + "num_tokens": 876089358.0, + "step": 22961 + }, + { + "epoch": 2.921002416995293, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.013906717300415, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8513293266296387, + "num_tokens": 876126287.0, + "step": 22962 + }, + { + "epoch": 2.9211296272738836, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.074327230453491, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8752156496047974, + "num_tokens": 876165157.0, + "step": 22963 + }, + { + "epoch": 2.921256837552474, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.887862205505371, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8663589358329773, + "num_tokens": 876201391.0, + "step": 22964 + }, + { + "epoch": 2.9213840478310646, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6597099304199219, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8731019496917725, + "num_tokens": 876243004.0, + "step": 22965 + }, + { + "epoch": 2.921511258109655, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8178002834320068, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8680811524391174, + "num_tokens": 876279245.0, + "step": 22966 + }, + { + "epoch": 2.9216384683882457, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.231323719024658, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8637477159500122, + "num_tokens": 876312145.0, + "step": 22967 + }, + { + "epoch": 2.9217656786668362, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9784132242202759, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8665989637374878, + "num_tokens": 876351354.0, + "step": 22968 + }, + { + "epoch": 2.9218928889454268, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0441994667053223, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8548766374588013, + "num_tokens": 876388621.0, + "step": 22969 + }, + { + "epoch": 2.9220200992240173, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.3085074424743652, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.859298825263977, + "num_tokens": 876418377.0, + "step": 22970 + }, + { + "epoch": 2.922147309502608, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9862936735153198, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.868699848651886, + "num_tokens": 876453351.0, + "step": 22971 + }, + { + "epoch": 2.9222745197811983, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.166872024536133, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.871536910533905, + "num_tokens": 876484830.0, + "step": 22972 + }, + { + "epoch": 2.922401730059789, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8462071418762207, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8762543201446533, + "num_tokens": 876524318.0, + "step": 22973 + }, + { + "epoch": 2.9225289403383794, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.3638617992401123, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8633365631103516, + "num_tokens": 876562421.0, + "step": 22974 + }, + { + "epoch": 2.92265615061697, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9534164667129517, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8593934178352356, + "num_tokens": 876605681.0, + "step": 22975 + }, + { + "epoch": 2.9227833608955605, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7725226879119873, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8553910255432129, + "num_tokens": 876647371.0, + "step": 22976 + }, + { + "epoch": 2.922910571174151, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8584613800048828, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8713705539703369, + "num_tokens": 876685563.0, + "step": 22977 + }, + { + "epoch": 2.9230377814527415, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0712673664093018, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8611578941345215, + "num_tokens": 876718404.0, + "step": 22978 + }, + { + "epoch": 2.923164991731332, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0546486377716064, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8680759072303772, + "num_tokens": 876751888.0, + "step": 22979 + }, + { + "epoch": 2.9232922020099226, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.5420775413513184, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8704895973205566, + "num_tokens": 876786813.0, + "step": 22980 + }, + { + "epoch": 2.9234194122885127, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0024890899658203, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8755050897598267, + "num_tokens": 876824511.0, + "step": 22981 + }, + { + "epoch": 2.9235466225671036, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9992693662643433, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8678845167160034, + "num_tokens": 876856409.0, + "step": 22982 + }, + { + "epoch": 2.9236738328456937, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9591617584228516, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8496874570846558, + "num_tokens": 876889204.0, + "step": 22983 + }, + { + "epoch": 2.9238010431242847, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1037230491638184, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8673760890960693, + "num_tokens": 876927185.0, + "step": 22984 + }, + { + "epoch": 2.9239282534028748, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.965646982192993, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.862899899482727, + "num_tokens": 876965744.0, + "step": 22985 + }, + { + "epoch": 2.9240554636814657, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0785701274871826, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8728413581848145, + "num_tokens": 876997107.0, + "step": 22986 + }, + { + "epoch": 2.924182673960056, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8826924562454224, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8553684949874878, + "num_tokens": 877037894.0, + "step": 22987 + }, + { + "epoch": 2.9243098842386464, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.82050359249115, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8532223701477051, + "num_tokens": 877078724.0, + "step": 22988 + }, + { + "epoch": 2.924437094517237, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8816794157028198, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8657741546630859, + "num_tokens": 877119685.0, + "step": 22989 + }, + { + "epoch": 2.9245643047958274, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9632866382598877, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8687479496002197, + "num_tokens": 877157055.0, + "step": 22990 + }, + { + "epoch": 2.924691515074418, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9749025106430054, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8712093830108643, + "num_tokens": 877188569.0, + "step": 22991 + }, + { + "epoch": 2.9248187253530085, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.960852861404419, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8566171526908875, + "num_tokens": 877222745.0, + "step": 22992 + }, + { + "epoch": 2.924945935631599, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9115623235702515, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8494501113891602, + "num_tokens": 877262510.0, + "step": 22993 + }, + { + "epoch": 2.9250731459101895, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.368029832839966, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8591912388801575, + "num_tokens": 877300164.0, + "step": 22994 + }, + { + "epoch": 2.92520035618878, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0237133502960205, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8746707439422607, + "num_tokens": 877337187.0, + "step": 22995 + }, + { + "epoch": 2.9253275664673706, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.073068141937256, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8459762334823608, + "num_tokens": 877375902.0, + "step": 22996 + }, + { + "epoch": 2.925454776745961, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7934550046920776, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8644843101501465, + "num_tokens": 877414406.0, + "step": 22997 + }, + { + "epoch": 2.9255819870245516, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7835264205932617, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8725175857543945, + "num_tokens": 877448977.0, + "step": 22998 + }, + { + "epoch": 2.925709197303142, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7483525276184082, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8730913400650024, + "num_tokens": 877486766.0, + "step": 22999 + }, + { + "epoch": 2.9258364075817327, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 3.0853612422943115, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8797193765640259, + "num_tokens": 877519320.0, + "step": 23000 + }, + { + "epoch": 2.925963617860323, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0248498916625977, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8404013514518738, + "num_tokens": 877557913.0, + "step": 23001 + }, + { + "epoch": 2.9260908281389137, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.2814695835113525, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8642861843109131, + "num_tokens": 877591197.0, + "step": 23002 + }, + { + "epoch": 2.9262180384175043, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.7658450603485107, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8812865614891052, + "num_tokens": 877625018.0, + "step": 23003 + }, + { + "epoch": 2.926345248696095, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8824464082717896, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8694875240325928, + "num_tokens": 877662433.0, + "step": 23004 + }, + { + "epoch": 2.9264724589746853, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7331125736236572, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8777892589569092, + "num_tokens": 877703576.0, + "step": 23005 + }, + { + "epoch": 2.9265996692532754, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.956450343132019, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8598813414573669, + "num_tokens": 877734025.0, + "step": 23006 + }, + { + "epoch": 2.9267268795318664, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8923006057739258, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8766592144966125, + "num_tokens": 877771942.0, + "step": 23007 + }, + { + "epoch": 2.9268540898104565, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9124054908752441, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8688730001449585, + "num_tokens": 877815662.0, + "step": 23008 + }, + { + "epoch": 2.9269813000890474, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 3.0231916904449463, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8481991291046143, + "num_tokens": 877855398.0, + "step": 23009 + }, + { + "epoch": 2.9271085103676375, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7901593446731567, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8594034910202026, + "num_tokens": 877898544.0, + "step": 23010 + }, + { + "epoch": 2.927235720646228, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.830274224281311, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8652454614639282, + "num_tokens": 877937914.0, + "step": 23011 + }, + { + "epoch": 2.9273629309248186, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7672513723373413, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8488065004348755, + "num_tokens": 877979266.0, + "step": 23012 + }, + { + "epoch": 2.927490141203409, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8803348541259766, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8699030876159668, + "num_tokens": 878019762.0, + "step": 23013 + }, + { + "epoch": 2.9276173514819996, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8929051160812378, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8597548604011536, + "num_tokens": 878062174.0, + "step": 23014 + }, + { + "epoch": 2.92774456176059, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9642300605773926, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8594573736190796, + "num_tokens": 878098197.0, + "step": 23015 + }, + { + "epoch": 2.9278717720391807, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0644819736480713, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8717586994171143, + "num_tokens": 878130478.0, + "step": 23016 + }, + { + "epoch": 2.9279989823177712, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.917815923690796, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8488196730613708, + "num_tokens": 878175670.0, + "step": 23017 + }, + { + "epoch": 2.9281261925963618, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.046811819076538, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8535076975822449, + "num_tokens": 878216415.0, + "step": 23018 + }, + { + "epoch": 2.9282534028749523, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.013613700866699, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8641958832740784, + "num_tokens": 878249582.0, + "step": 23019 + }, + { + "epoch": 2.928380613153543, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7921818494796753, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.875998854637146, + "num_tokens": 878290859.0, + "step": 23020 + }, + { + "epoch": 2.9285078234321333, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8909761905670166, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.879442572593689, + "num_tokens": 878322463.0, + "step": 23021 + }, + { + "epoch": 2.928635033710724, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7436519861221313, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8749687671661377, + "num_tokens": 878365754.0, + "step": 23022 + }, + { + "epoch": 2.9287622439893144, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6703484058380127, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8790842294692993, + "num_tokens": 878410325.0, + "step": 23023 + }, + { + "epoch": 2.928889454267905, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.783438801765442, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8701363205909729, + "num_tokens": 878451604.0, + "step": 23024 + }, + { + "epoch": 2.9290166645464955, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0841007232666016, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8686317205429077, + "num_tokens": 878485174.0, + "step": 23025 + }, + { + "epoch": 2.929143874825086, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9273080825805664, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.858767032623291, + "num_tokens": 878525867.0, + "step": 23026 + }, + { + "epoch": 2.9292710851036765, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0414047241210938, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8544825911521912, + "num_tokens": 878561327.0, + "step": 23027 + }, + { + "epoch": 2.929398295382267, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0474307537078857, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8710225820541382, + "num_tokens": 878603045.0, + "step": 23028 + }, + { + "epoch": 2.929525505660857, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.794333577156067, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8720633387565613, + "num_tokens": 878645051.0, + "step": 23029 + }, + { + "epoch": 2.929652715939448, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.000959873199463, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.849065899848938, + "num_tokens": 878682003.0, + "step": 23030 + }, + { + "epoch": 2.929779926218038, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.91521155834198, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8695651888847351, + "num_tokens": 878720509.0, + "step": 23031 + }, + { + "epoch": 2.929907136496629, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.798182725906372, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8719244003295898, + "num_tokens": 878757950.0, + "step": 23032 + }, + { + "epoch": 2.9300343467752192, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7460792064666748, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8585187792778015, + "num_tokens": 878801477.0, + "step": 23033 + }, + { + "epoch": 2.93016155705381, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9221415519714355, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8582125306129456, + "num_tokens": 878838802.0, + "step": 23034 + }, + { + "epoch": 2.9302887673324003, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9971535205841064, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8595166802406311, + "num_tokens": 878875566.0, + "step": 23035 + }, + { + "epoch": 2.930415977610991, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.19006085395813, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8577858209609985, + "num_tokens": 878913463.0, + "step": 23036 + }, + { + "epoch": 2.9305431878895813, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9248493909835815, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8524933457374573, + "num_tokens": 878950769.0, + "step": 23037 + }, + { + "epoch": 2.930670398168172, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8676470518112183, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.875052809715271, + "num_tokens": 878989041.0, + "step": 23038 + }, + { + "epoch": 2.9307976084467624, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9882148504257202, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8739547729492188, + "num_tokens": 879020195.0, + "step": 23039 + }, + { + "epoch": 2.930924818725353, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.89984130859375, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8630157709121704, + "num_tokens": 879057818.0, + "step": 23040 + }, + { + "epoch": 2.9310520290039435, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1117358207702637, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8601659536361694, + "num_tokens": 879094747.0, + "step": 23041 + }, + { + "epoch": 2.931179239282534, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.015965461730957, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8575419187545776, + "num_tokens": 879131127.0, + "step": 23042 + }, + { + "epoch": 2.9313064495611245, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.86357843875885, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8657656311988831, + "num_tokens": 879168541.0, + "step": 23043 + }, + { + "epoch": 2.931433659839715, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0522749423980713, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8612719774246216, + "num_tokens": 879199409.0, + "step": 23044 + }, + { + "epoch": 2.9315608701183056, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9033416509628296, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8791823387145996, + "num_tokens": 879236447.0, + "step": 23045 + }, + { + "epoch": 2.931688080396896, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8092198371887207, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8661158680915833, + "num_tokens": 879275596.0, + "step": 23046 + }, + { + "epoch": 2.9318152906754866, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.890092372894287, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8651574850082397, + "num_tokens": 879315620.0, + "step": 23047 + }, + { + "epoch": 2.931942500954077, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8754620552062988, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8675600290298462, + "num_tokens": 879356220.0, + "step": 23048 + }, + { + "epoch": 2.9320697112326677, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8462753295898438, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8502025604248047, + "num_tokens": 879397544.0, + "step": 23049 + }, + { + "epoch": 2.932196921511258, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0769803524017334, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8755437135696411, + "num_tokens": 879438165.0, + "step": 23050 + }, + { + "epoch": 2.9323241317898487, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.984669804573059, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8681347370147705, + "num_tokens": 879473601.0, + "step": 23051 + }, + { + "epoch": 2.9324513420684393, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8296009302139282, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8587614297866821, + "num_tokens": 879510998.0, + "step": 23052 + }, + { + "epoch": 2.93257855234703, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8954530954360962, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8697835803031921, + "num_tokens": 879551469.0, + "step": 23053 + }, + { + "epoch": 2.93270576262562, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.017571210861206, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8580184578895569, + "num_tokens": 879585620.0, + "step": 23054 + }, + { + "epoch": 2.932832972904211, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.014106512069702, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8755078315734863, + "num_tokens": 879621749.0, + "step": 23055 + }, + { + "epoch": 2.932960183182801, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7837820053100586, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8681020736694336, + "num_tokens": 879664529.0, + "step": 23056 + }, + { + "epoch": 2.933087393461392, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9896697998046875, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8706966042518616, + "num_tokens": 879696293.0, + "step": 23057 + }, + { + "epoch": 2.933214603739982, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8995970487594604, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8711444139480591, + "num_tokens": 879733562.0, + "step": 23058 + }, + { + "epoch": 2.933341814018573, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.828316569328308, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8634127378463745, + "num_tokens": 879771740.0, + "step": 23059 + }, + { + "epoch": 2.933469024297163, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.276862859725952, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8609693050384521, + "num_tokens": 879805578.0, + "step": 23060 + }, + { + "epoch": 2.9335962345757536, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9931179285049438, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8642839193344116, + "num_tokens": 879845175.0, + "step": 23061 + }, + { + "epoch": 2.933723444854344, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8472471237182617, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8777305483818054, + "num_tokens": 879878975.0, + "step": 23062 + }, + { + "epoch": 2.9338506551329346, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.929914951324463, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8576918840408325, + "num_tokens": 879919373.0, + "step": 23063 + }, + { + "epoch": 2.933977865411525, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0584774017333984, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8452225923538208, + "num_tokens": 879953821.0, + "step": 23064 + }, + { + "epoch": 2.9341050756901157, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8426291942596436, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8561980724334717, + "num_tokens": 879993505.0, + "step": 23065 + }, + { + "epoch": 2.934232285968706, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9271762371063232, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8706440925598145, + "num_tokens": 880030124.0, + "step": 23066 + }, + { + "epoch": 2.9343594962472968, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2260072231292725, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8414666652679443, + "num_tokens": 880059480.0, + "step": 23067 + }, + { + "epoch": 2.9344867065258873, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0516891479492188, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8658634424209595, + "num_tokens": 880093402.0, + "step": 23068 + }, + { + "epoch": 2.934613916804478, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.276905059814453, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8629982471466064, + "num_tokens": 880127874.0, + "step": 23069 + }, + { + "epoch": 2.9347411270830683, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9275238513946533, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8751921653747559, + "num_tokens": 880165755.0, + "step": 23070 + }, + { + "epoch": 2.934868337361659, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8888362646102905, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8600050806999207, + "num_tokens": 880203931.0, + "step": 23071 + }, + { + "epoch": 2.9349955476402494, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.421454429626465, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8676415085792542, + "num_tokens": 880242844.0, + "step": 23072 + }, + { + "epoch": 2.93512275791884, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.037853956222534, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8496710062026978, + "num_tokens": 880280068.0, + "step": 23073 + }, + { + "epoch": 2.9352499681974304, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.1001665592193604, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8724356889724731, + "num_tokens": 880314044.0, + "step": 23074 + }, + { + "epoch": 2.935377178476021, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7656131982803345, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8625872135162354, + "num_tokens": 880355077.0, + "step": 23075 + }, + { + "epoch": 2.9355043887546115, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8663123846054077, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.880174994468689, + "num_tokens": 880388906.0, + "step": 23076 + }, + { + "epoch": 2.935631599033202, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8001173734664917, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8767716884613037, + "num_tokens": 880426834.0, + "step": 23077 + }, + { + "epoch": 2.9357588093117926, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9613240957260132, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8639674782752991, + "num_tokens": 880463477.0, + "step": 23078 + }, + { + "epoch": 2.9358860195903826, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0809295177459717, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8481329679489136, + "num_tokens": 880500884.0, + "step": 23079 + }, + { + "epoch": 2.9360132298689736, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.966916561126709, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8603451251983643, + "num_tokens": 880539724.0, + "step": 23080 + }, + { + "epoch": 2.9361404401475637, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9551668167114258, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8582045435905457, + "num_tokens": 880577820.0, + "step": 23081 + }, + { + "epoch": 2.9362676504261547, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9045500755310059, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8687999844551086, + "num_tokens": 880612055.0, + "step": 23082 + }, + { + "epoch": 2.9363948607047448, + "ewc_loss": 9.000301361083984e-06, + "grad_norm": 80.51956939697266, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8739051818847656, + "num_tokens": 880654907.0, + "step": 23083 + }, + { + "epoch": 2.9365220709833357, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.922438144683838, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8809293508529663, + "num_tokens": 880692418.0, + "step": 23084 + }, + { + "epoch": 2.936649281261926, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.203740119934082, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8508477807044983, + "num_tokens": 880727742.0, + "step": 23085 + }, + { + "epoch": 2.9367764915405163, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9317175149917603, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8771094083786011, + "num_tokens": 880766673.0, + "step": 23086 + }, + { + "epoch": 2.936903701819107, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8619791269302368, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8577268719673157, + "num_tokens": 880810390.0, + "step": 23087 + }, + { + "epoch": 2.9370309120976974, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8201212882995605, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8606809377670288, + "num_tokens": 880846920.0, + "step": 23088 + }, + { + "epoch": 2.937158122376288, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.849610447883606, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8716763257980347, + "num_tokens": 880885867.0, + "step": 23089 + }, + { + "epoch": 2.9372853326548785, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.803120732307434, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8763604760169983, + "num_tokens": 880928408.0, + "step": 23090 + }, + { + "epoch": 2.937412542933469, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0182743072509766, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8643316030502319, + "num_tokens": 880960681.0, + "step": 23091 + }, + { + "epoch": 2.9375397532120595, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1483421325683594, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.855343222618103, + "num_tokens": 881000358.0, + "step": 23092 + }, + { + "epoch": 2.93766696349065, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9299440383911133, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8667116761207581, + "num_tokens": 881037765.0, + "step": 23093 + }, + { + "epoch": 2.9377941737692406, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8375128507614136, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8792312145233154, + "num_tokens": 881076091.0, + "step": 23094 + }, + { + "epoch": 2.937921384047831, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9450759887695312, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8638153076171875, + "num_tokens": 881112784.0, + "step": 23095 + }, + { + "epoch": 2.9380485943264216, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0568830966949463, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8718522191047668, + "num_tokens": 881149938.0, + "step": 23096 + }, + { + "epoch": 2.938175804605012, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.016080856323242, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8560278415679932, + "num_tokens": 881187541.0, + "step": 23097 + }, + { + "epoch": 2.9383030148836027, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.274768829345703, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8674628138542175, + "num_tokens": 881221920.0, + "step": 23098 + }, + { + "epoch": 2.938430225162193, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8630263805389404, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8505120873451233, + "num_tokens": 881264804.0, + "step": 23099 + }, + { + "epoch": 2.9385574354407837, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9236729145050049, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.859027624130249, + "num_tokens": 881304482.0, + "step": 23100 + }, + { + "epoch": 2.9386846457193743, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0643746852874756, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8565173149108887, + "num_tokens": 881340822.0, + "step": 23101 + }, + { + "epoch": 2.938811855997965, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0403149127960205, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8628772497177124, + "num_tokens": 881374352.0, + "step": 23102 + }, + { + "epoch": 2.9389390662765553, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9596374034881592, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8597735166549683, + "num_tokens": 881413176.0, + "step": 23103 + }, + { + "epoch": 2.9390662765551454, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.049861431121826, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.86072838306427, + "num_tokens": 881454309.0, + "step": 23104 + }, + { + "epoch": 2.9391934868337364, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9202817678451538, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8658161163330078, + "num_tokens": 881490702.0, + "step": 23105 + }, + { + "epoch": 2.9393206971123265, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.753419041633606, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.872552216053009, + "num_tokens": 881530394.0, + "step": 23106 + }, + { + "epoch": 2.9394479073909174, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9019746780395508, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8784871697425842, + "num_tokens": 881567286.0, + "step": 23107 + }, + { + "epoch": 2.9395751176695075, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.912767767906189, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8609615564346313, + "num_tokens": 881607017.0, + "step": 23108 + }, + { + "epoch": 2.939702327948098, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7380681037902832, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8649975061416626, + "num_tokens": 881645793.0, + "step": 23109 + }, + { + "epoch": 2.9398295382266886, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.846671223640442, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8680163621902466, + "num_tokens": 881683596.0, + "step": 23110 + }, + { + "epoch": 2.939956748505279, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1169512271881104, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8489069938659668, + "num_tokens": 881717607.0, + "step": 23111 + }, + { + "epoch": 2.9400839587838696, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9793812036514282, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8789896965026855, + "num_tokens": 881750884.0, + "step": 23112 + }, + { + "epoch": 2.94021116906246, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0639030933380127, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8608561754226685, + "num_tokens": 881789345.0, + "step": 23113 + }, + { + "epoch": 2.9403383793410507, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.899296522140503, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8604583740234375, + "num_tokens": 881828459.0, + "step": 23114 + }, + { + "epoch": 2.940465589619641, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9479385614395142, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8584861159324646, + "num_tokens": 881865294.0, + "step": 23115 + }, + { + "epoch": 2.9405927998982317, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8199602365493774, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.868768572807312, + "num_tokens": 881904936.0, + "step": 23116 + }, + { + "epoch": 2.9407200101768223, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0869204998016357, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8757758140563965, + "num_tokens": 881936300.0, + "step": 23117 + }, + { + "epoch": 2.940847220455413, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8734157085418701, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8764239549636841, + "num_tokens": 881974092.0, + "step": 23118 + }, + { + "epoch": 2.9409744307340033, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9622139930725098, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8666683435440063, + "num_tokens": 882019339.0, + "step": 23119 + }, + { + "epoch": 2.941101641012594, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.3469836711883545, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8688206672668457, + "num_tokens": 882052837.0, + "step": 23120 + }, + { + "epoch": 2.9412288512911844, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9638350009918213, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8705466985702515, + "num_tokens": 882088449.0, + "step": 23121 + }, + { + "epoch": 2.941356061569775, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.012971878051758, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8575906753540039, + "num_tokens": 882124844.0, + "step": 23122 + }, + { + "epoch": 2.9414832718483654, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8829443454742432, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.870175838470459, + "num_tokens": 882162246.0, + "step": 23123 + }, + { + "epoch": 2.941610482126956, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.086984634399414, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8552999496459961, + "num_tokens": 882202825.0, + "step": 23124 + }, + { + "epoch": 2.9417376924055465, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9974242448806763, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8503919839859009, + "num_tokens": 882237951.0, + "step": 23125 + }, + { + "epoch": 2.941864902684137, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 16.610578536987305, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8623504042625427, + "num_tokens": 882279147.0, + "step": 23126 + }, + { + "epoch": 2.941992112962727, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0360636711120605, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8866541981697083, + "num_tokens": 882322777.0, + "step": 23127 + }, + { + "epoch": 2.942119323241318, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.049795389175415, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8696914911270142, + "num_tokens": 882360681.0, + "step": 23128 + }, + { + "epoch": 2.942246533519908, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8855724334716797, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8435246348381042, + "num_tokens": 882403823.0, + "step": 23129 + }, + { + "epoch": 2.942373743798499, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.816396951675415, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8736975789070129, + "num_tokens": 882443723.0, + "step": 23130 + }, + { + "epoch": 2.9425009540770892, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.950016736984253, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.849405825138092, + "num_tokens": 882480610.0, + "step": 23131 + }, + { + "epoch": 2.94262816435568, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9955835342407227, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8672534823417664, + "num_tokens": 882518789.0, + "step": 23132 + }, + { + "epoch": 2.9427553746342703, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9782856702804565, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8532037138938904, + "num_tokens": 882556598.0, + "step": 23133 + }, + { + "epoch": 2.942882584912861, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8149572610855103, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8591788411140442, + "num_tokens": 882593788.0, + "step": 23134 + }, + { + "epoch": 2.9430097951914513, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9265739917755127, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8711074590682983, + "num_tokens": 882630861.0, + "step": 23135 + }, + { + "epoch": 2.943137005470042, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.039612054824829, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8728227615356445, + "num_tokens": 882670950.0, + "step": 23136 + }, + { + "epoch": 2.9432642157486324, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1357853412628174, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8545131683349609, + "num_tokens": 882705680.0, + "step": 23137 + }, + { + "epoch": 2.943391426027223, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 6.504669189453125, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8742150068283081, + "num_tokens": 882737589.0, + "step": 23138 + }, + { + "epoch": 2.9435186363058135, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0783004760742188, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.849894642829895, + "num_tokens": 882774502.0, + "step": 23139 + }, + { + "epoch": 2.943645846584404, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9771515130996704, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8551657199859619, + "num_tokens": 882815184.0, + "step": 23140 + }, + { + "epoch": 2.9437730568629945, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8257161378860474, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8760072588920593, + "num_tokens": 882856793.0, + "step": 23141 + }, + { + "epoch": 2.943900267141585, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8136227130889893, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8767250180244446, + "num_tokens": 882896128.0, + "step": 23142 + }, + { + "epoch": 2.9440274774201756, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0071561336517334, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8720111846923828, + "num_tokens": 882931253.0, + "step": 23143 + }, + { + "epoch": 2.944154687698766, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7561460733413696, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8825188875198364, + "num_tokens": 882975072.0, + "step": 23144 + }, + { + "epoch": 2.9442818979773566, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8502172231674194, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8779670596122742, + "num_tokens": 883013576.0, + "step": 23145 + }, + { + "epoch": 2.944409108255947, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.209686279296875, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8670176267623901, + "num_tokens": 883046052.0, + "step": 23146 + }, + { + "epoch": 2.9445363185345377, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9378037452697754, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8460360169410706, + "num_tokens": 883080398.0, + "step": 23147 + }, + { + "epoch": 2.944663528813128, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8723413944244385, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8564053177833557, + "num_tokens": 883120321.0, + "step": 23148 + }, + { + "epoch": 2.9447907390917187, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.882067084312439, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8719742298126221, + "num_tokens": 883161285.0, + "step": 23149 + }, + { + "epoch": 2.9449179493703093, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8611818552017212, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8642227649688721, + "num_tokens": 883199857.0, + "step": 23150 + }, + { + "epoch": 2.9450451596489, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8063443899154663, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8722860217094421, + "num_tokens": 883238325.0, + "step": 23151 + }, + { + "epoch": 2.94517236992749, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.743725299835205, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8558015823364258, + "num_tokens": 883281814.0, + "step": 23152 + }, + { + "epoch": 2.945299580206081, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7880939245224, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8703157901763916, + "num_tokens": 883317385.0, + "step": 23153 + }, + { + "epoch": 2.945426790484671, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.986207365989685, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8743278980255127, + "num_tokens": 883353498.0, + "step": 23154 + }, + { + "epoch": 2.945554000763262, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7794169187545776, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8730654120445251, + "num_tokens": 883391709.0, + "step": 23155 + }, + { + "epoch": 2.945681211041852, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1068248748779297, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8496588468551636, + "num_tokens": 883426329.0, + "step": 23156 + }, + { + "epoch": 2.945808421320443, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8191611766815186, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8630412817001343, + "num_tokens": 883465797.0, + "step": 23157 + }, + { + "epoch": 2.945935631599033, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.3058621883392334, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8667658567428589, + "num_tokens": 883501632.0, + "step": 23158 + }, + { + "epoch": 2.9460628418776236, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9305708408355713, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8753241300582886, + "num_tokens": 883537622.0, + "step": 23159 + }, + { + "epoch": 2.946190052156214, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7741725444793701, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8659054040908813, + "num_tokens": 883576458.0, + "step": 23160 + }, + { + "epoch": 2.9463172624348046, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.797534465789795, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8562489151954651, + "num_tokens": 883620929.0, + "step": 23161 + }, + { + "epoch": 2.946444472713395, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9805593490600586, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8416738510131836, + "num_tokens": 883652589.0, + "step": 23162 + }, + { + "epoch": 2.9465716829919857, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1040444374084473, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8668407201766968, + "num_tokens": 883691844.0, + "step": 23163 + }, + { + "epoch": 2.946698893270576, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9169749021530151, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.845649003982544, + "num_tokens": 883732838.0, + "step": 23164 + }, + { + "epoch": 2.9468261035491667, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 16.602140426635742, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8636008501052856, + "num_tokens": 883771415.0, + "step": 23165 + }, + { + "epoch": 2.9469533138277573, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.2157974243164062, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8516113758087158, + "num_tokens": 883808807.0, + "step": 23166 + }, + { + "epoch": 2.947080524106348, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0069222450256348, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8792364597320557, + "num_tokens": 883844300.0, + "step": 23167 + }, + { + "epoch": 2.9472077343849383, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8269541263580322, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8805093765258789, + "num_tokens": 883879337.0, + "step": 23168 + }, + { + "epoch": 2.947334944663529, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.7006616592407227, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8552908897399902, + "num_tokens": 883925810.0, + "step": 23169 + }, + { + "epoch": 2.9474621549421194, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7710896730422974, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8748969435691833, + "num_tokens": 883964753.0, + "step": 23170 + }, + { + "epoch": 2.94758936522071, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7874078750610352, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.857537567615509, + "num_tokens": 884005737.0, + "step": 23171 + }, + { + "epoch": 2.9477165754993004, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.95067298412323, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8659650087356567, + "num_tokens": 884043414.0, + "step": 23172 + }, + { + "epoch": 2.947843785777891, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9543397426605225, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8701795339584351, + "num_tokens": 884077399.0, + "step": 23173 + }, + { + "epoch": 2.9479709960564815, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.283498764038086, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8643676042556763, + "num_tokens": 884116539.0, + "step": 23174 + }, + { + "epoch": 2.948098206335072, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.978400468826294, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8725131154060364, + "num_tokens": 884149756.0, + "step": 23175 + }, + { + "epoch": 2.9482254166136626, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.005038022994995, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8852039575576782, + "num_tokens": 884181960.0, + "step": 23176 + }, + { + "epoch": 2.9483526268922526, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.996026635169983, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8603052496910095, + "num_tokens": 884223562.0, + "step": 23177 + }, + { + "epoch": 2.9484798371708436, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 3.925245523452759, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8699766993522644, + "num_tokens": 884264541.0, + "step": 23178 + }, + { + "epoch": 2.9486070474494337, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.3127005100250244, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8514576554298401, + "num_tokens": 884301171.0, + "step": 23179 + }, + { + "epoch": 2.9487342577280247, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.035341262817383, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8638623952865601, + "num_tokens": 884332481.0, + "step": 23180 + }, + { + "epoch": 2.9488614680066147, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.780739188194275, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8861608505249023, + "num_tokens": 884371616.0, + "step": 23181 + }, + { + "epoch": 2.9489886782852053, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8231165409088135, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8548514246940613, + "num_tokens": 884411245.0, + "step": 23182 + }, + { + "epoch": 2.949115888563796, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9432727098464966, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8615179061889648, + "num_tokens": 884445712.0, + "step": 23183 + }, + { + "epoch": 2.9492430988423863, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8060499429702759, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8703314065933228, + "num_tokens": 884484935.0, + "step": 23184 + }, + { + "epoch": 2.949370309120977, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.94340181350708, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8661285638809204, + "num_tokens": 884518714.0, + "step": 23185 + }, + { + "epoch": 2.9494975193995674, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7793551683425903, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8583600521087646, + "num_tokens": 884560430.0, + "step": 23186 + }, + { + "epoch": 2.949624729678158, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9417392015457153, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8569443225860596, + "num_tokens": 884595937.0, + "step": 23187 + }, + { + "epoch": 2.9497519399567484, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.022714376449585, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8607211709022522, + "num_tokens": 884633201.0, + "step": 23188 + }, + { + "epoch": 2.949879150235339, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9350779056549072, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8681526780128479, + "num_tokens": 884672602.0, + "step": 23189 + }, + { + "epoch": 2.9500063605139295, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.025784492492676, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8674685955047607, + "num_tokens": 884708378.0, + "step": 23190 + }, + { + "epoch": 2.95013357079252, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.858577847480774, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8612880706787109, + "num_tokens": 884747527.0, + "step": 23191 + }, + { + "epoch": 2.9502607810711106, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.053032159805298, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8845497965812683, + "num_tokens": 884781738.0, + "step": 23192 + }, + { + "epoch": 2.950387991349701, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9328370094299316, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8513957858085632, + "num_tokens": 884818884.0, + "step": 23193 + }, + { + "epoch": 2.9505152016282916, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.884392023086548, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8752985000610352, + "num_tokens": 884851751.0, + "step": 23194 + }, + { + "epoch": 2.950642411906882, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8788319826126099, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.868714451789856, + "num_tokens": 884892935.0, + "step": 23195 + }, + { + "epoch": 2.9507696221854727, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 2.49517822265625, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8704482316970825, + "num_tokens": 884935820.0, + "step": 23196 + }, + { + "epoch": 2.950896832464063, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 16.607067108154297, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8632283806800842, + "num_tokens": 884976042.0, + "step": 23197 + }, + { + "epoch": 2.9510240427426537, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8342435359954834, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8846067786216736, + "num_tokens": 885013511.0, + "step": 23198 + }, + { + "epoch": 2.9511512530212443, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.197781801223755, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8531616926193237, + "num_tokens": 885047291.0, + "step": 23199 + }, + { + "epoch": 2.951278463299835, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9267476797103882, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8619809150695801, + "num_tokens": 885083116.0, + "step": 23200 + }, + { + "epoch": 2.9514056735784253, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8237043619155884, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8620131015777588, + "num_tokens": 885123282.0, + "step": 23201 + }, + { + "epoch": 2.9515328838570154, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.790073037147522, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8696223497390747, + "num_tokens": 885166277.0, + "step": 23202 + }, + { + "epoch": 2.9516600941356064, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7021361589431763, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8767437934875488, + "num_tokens": 885206414.0, + "step": 23203 + }, + { + "epoch": 2.9517873044141965, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.034517765045166, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.874049186706543, + "num_tokens": 885240977.0, + "step": 23204 + }, + { + "epoch": 2.9519145146927874, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.3659002780914307, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8822795748710632, + "num_tokens": 885283190.0, + "step": 23205 + }, + { + "epoch": 2.9520417249713775, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8506991863250732, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8789401054382324, + "num_tokens": 885321340.0, + "step": 23206 + }, + { + "epoch": 2.952168935249968, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8686472177505493, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8736551403999329, + "num_tokens": 885357524.0, + "step": 23207 + }, + { + "epoch": 2.9522961455285586, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.100984811782837, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8406449556350708, + "num_tokens": 885389696.0, + "step": 23208 + }, + { + "epoch": 2.952423355807149, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8117928504943848, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8806725144386292, + "num_tokens": 885429574.0, + "step": 23209 + }, + { + "epoch": 2.9525505660857396, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.105527877807617, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8681595325469971, + "num_tokens": 885459961.0, + "step": 23210 + }, + { + "epoch": 2.95267777636433, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7905693054199219, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8593795299530029, + "num_tokens": 885506725.0, + "step": 23211 + }, + { + "epoch": 2.9528049866429207, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.093312978744507, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8694197535514832, + "num_tokens": 885549409.0, + "step": 23212 + }, + { + "epoch": 2.952932196921511, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8726117610931396, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8720576763153076, + "num_tokens": 885589272.0, + "step": 23213 + }, + { + "epoch": 2.9530594072001017, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.060900926589966, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8495189547538757, + "num_tokens": 885625951.0, + "step": 23214 + }, + { + "epoch": 2.9531866174786923, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9567986726760864, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8695534467697144, + "num_tokens": 885665844.0, + "step": 23215 + }, + { + "epoch": 2.953313827757283, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9061753749847412, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8647840023040771, + "num_tokens": 885705616.0, + "step": 23216 + }, + { + "epoch": 2.9534410380358733, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6497021913528442, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8767255544662476, + "num_tokens": 885752362.0, + "step": 23217 + }, + { + "epoch": 2.953568248314464, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7454270124435425, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8606472015380859, + "num_tokens": 885792485.0, + "step": 23218 + }, + { + "epoch": 2.9536954585930544, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7286475896835327, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.867544412612915, + "num_tokens": 885834809.0, + "step": 23219 + }, + { + "epoch": 2.953822668871645, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8927948474884033, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8695989847183228, + "num_tokens": 885870555.0, + "step": 23220 + }, + { + "epoch": 2.9539498791502354, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8288519382476807, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8681310415267944, + "num_tokens": 885911911.0, + "step": 23221 + }, + { + "epoch": 2.954077089428826, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0598976612091064, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8790654540061951, + "num_tokens": 885941094.0, + "step": 23222 + }, + { + "epoch": 2.9542042997074165, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0188803672790527, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8644603490829468, + "num_tokens": 885978094.0, + "step": 23223 + }, + { + "epoch": 2.954331509986007, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9101406335830688, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8825134038925171, + "num_tokens": 886014859.0, + "step": 23224 + }, + { + "epoch": 2.954458720264597, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.860316276550293, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.873967170715332, + "num_tokens": 886048317.0, + "step": 23225 + }, + { + "epoch": 2.954585930543188, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.036046028137207, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8685590624809265, + "num_tokens": 886081797.0, + "step": 23226 + }, + { + "epoch": 2.954713140821778, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7681156396865845, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8526740074157715, + "num_tokens": 886124111.0, + "step": 23227 + }, + { + "epoch": 2.954840351100369, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7844429016113281, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8773391246795654, + "num_tokens": 886162823.0, + "step": 23228 + }, + { + "epoch": 2.954967561378959, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1758928298950195, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8676266670227051, + "num_tokens": 886204706.0, + "step": 23229 + }, + { + "epoch": 2.95509477165755, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8503938913345337, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8752914667129517, + "num_tokens": 886241461.0, + "step": 23230 + }, + { + "epoch": 2.9552219819361403, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8260318040847778, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8615946769714355, + "num_tokens": 886279276.0, + "step": 23231 + }, + { + "epoch": 2.955349192214731, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8491824865341187, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8671796321868896, + "num_tokens": 886317703.0, + "step": 23232 + }, + { + "epoch": 2.9554764024933213, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.07702898979187, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8736907839775085, + "num_tokens": 886355235.0, + "step": 23233 + }, + { + "epoch": 2.955603612771912, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8751585483551025, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8903790712356567, + "num_tokens": 886387354.0, + "step": 23234 + }, + { + "epoch": 2.9557308230505024, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0984513759613037, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8648061752319336, + "num_tokens": 886419742.0, + "step": 23235 + }, + { + "epoch": 2.955858033329093, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8008726835250854, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.868107795715332, + "num_tokens": 886455218.0, + "step": 23236 + }, + { + "epoch": 2.9559852436076834, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8264390230178833, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8675885200500488, + "num_tokens": 886494454.0, + "step": 23237 + }, + { + "epoch": 2.956112453886274, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.954643726348877, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8428124189376831, + "num_tokens": 886532854.0, + "step": 23238 + }, + { + "epoch": 2.9562396641648645, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9257442951202393, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8628871440887451, + "num_tokens": 886564288.0, + "step": 23239 + }, + { + "epoch": 2.956366874443455, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9154069423675537, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8682886362075806, + "num_tokens": 886597155.0, + "step": 23240 + }, + { + "epoch": 2.9564940847220456, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.3584256172180176, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8697385191917419, + "num_tokens": 886632015.0, + "step": 23241 + }, + { + "epoch": 2.956621295000636, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7562655210494995, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8836934566497803, + "num_tokens": 886665736.0, + "step": 23242 + }, + { + "epoch": 2.9567485052792266, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9569426774978638, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.866269052028656, + "num_tokens": 886702652.0, + "step": 23243 + }, + { + "epoch": 2.956875715557817, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8385999202728271, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8637585639953613, + "num_tokens": 886743573.0, + "step": 23244 + }, + { + "epoch": 2.9570029258364077, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8853200674057007, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8709672689437866, + "num_tokens": 886777932.0, + "step": 23245 + }, + { + "epoch": 2.957130136114998, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.031660795211792, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8554952144622803, + "num_tokens": 886813211.0, + "step": 23246 + }, + { + "epoch": 2.9572573463935887, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.906156063079834, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8704501390457153, + "num_tokens": 886849416.0, + "step": 23247 + }, + { + "epoch": 2.9573845566721793, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8021907806396484, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8512362837791443, + "num_tokens": 886887983.0, + "step": 23248 + }, + { + "epoch": 2.95751176695077, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.832136631011963, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8493171334266663, + "num_tokens": 886930867.0, + "step": 23249 + }, + { + "epoch": 2.95763897722936, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8427469730377197, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8654348850250244, + "num_tokens": 886971862.0, + "step": 23250 + }, + { + "epoch": 2.957766187507951, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9371963739395142, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8717786073684692, + "num_tokens": 887005238.0, + "step": 23251 + }, + { + "epoch": 2.957893397786541, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0755605697631836, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8585591912269592, + "num_tokens": 887035588.0, + "step": 23252 + }, + { + "epoch": 2.958020608065132, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7691634893417358, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8681017756462097, + "num_tokens": 887079962.0, + "step": 23253 + }, + { + "epoch": 2.958147818343722, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8457623720169067, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8866428136825562, + "num_tokens": 887114572.0, + "step": 23254 + }, + { + "epoch": 2.958275028622313, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8790298700332642, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.870398759841919, + "num_tokens": 887149396.0, + "step": 23255 + }, + { + "epoch": 2.958402238900903, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2178308963775635, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8608493804931641, + "num_tokens": 887191916.0, + "step": 23256 + }, + { + "epoch": 2.9585294491794936, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.84073007106781, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8757741451263428, + "num_tokens": 887228263.0, + "step": 23257 + }, + { + "epoch": 2.958656659458084, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9364045858383179, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8830796480178833, + "num_tokens": 887258688.0, + "step": 23258 + }, + { + "epoch": 2.9587838697366746, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.067208766937256, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8618243932723999, + "num_tokens": 887301565.0, + "step": 23259 + }, + { + "epoch": 2.958911080015265, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8245409727096558, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8628754615783691, + "num_tokens": 887337511.0, + "step": 23260 + }, + { + "epoch": 2.9590382902938557, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8964029550552368, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8702646493911743, + "num_tokens": 887370694.0, + "step": 23261 + }, + { + "epoch": 2.959165500572446, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.939022183418274, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.863681435585022, + "num_tokens": 887412131.0, + "step": 23262 + }, + { + "epoch": 2.9592927108510367, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9026901721954346, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8623167872428894, + "num_tokens": 887447179.0, + "step": 23263 + }, + { + "epoch": 2.9594199211296273, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9042167663574219, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8607795834541321, + "num_tokens": 887486079.0, + "step": 23264 + }, + { + "epoch": 2.959547131408218, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9620305299758911, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.875489354133606, + "num_tokens": 887519092.0, + "step": 23265 + }, + { + "epoch": 2.9596743416868083, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.95848548412323, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.86767578125, + "num_tokens": 887552008.0, + "step": 23266 + }, + { + "epoch": 2.959801551965399, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.985689640045166, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8617902994155884, + "num_tokens": 887585827.0, + "step": 23267 + }, + { + "epoch": 2.9599287622439894, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8652665615081787, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8646917343139648, + "num_tokens": 887628041.0, + "step": 23268 + }, + { + "epoch": 2.96005597252258, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.1637120246887207, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.871376633644104, + "num_tokens": 887667210.0, + "step": 23269 + }, + { + "epoch": 2.9601831828011704, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8197638988494873, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8717910647392273, + "num_tokens": 887705429.0, + "step": 23270 + }, + { + "epoch": 2.960310393079761, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7647544145584106, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8482783436775208, + "num_tokens": 887753188.0, + "step": 23271 + }, + { + "epoch": 2.9604376033583515, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9457505941390991, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8499962091445923, + "num_tokens": 887795907.0, + "step": 23272 + }, + { + "epoch": 2.960564813636942, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.900447964668274, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8673532009124756, + "num_tokens": 887834859.0, + "step": 23273 + }, + { + "epoch": 2.9606920239155325, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7299131155014038, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8623343110084534, + "num_tokens": 887879041.0, + "step": 23274 + }, + { + "epoch": 2.9608192341941226, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.025805950164795, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8611205816268921, + "num_tokens": 887913752.0, + "step": 23275 + }, + { + "epoch": 2.9609464444727136, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9743834733963013, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8710267543792725, + "num_tokens": 887950946.0, + "step": 23276 + }, + { + "epoch": 2.9610736547513037, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9533404111862183, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8630548119544983, + "num_tokens": 887993155.0, + "step": 23277 + }, + { + "epoch": 2.9612008650298947, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.40726637840271, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8652191162109375, + "num_tokens": 888027831.0, + "step": 23278 + }, + { + "epoch": 2.9613280753084847, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7835545539855957, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8754222989082336, + "num_tokens": 888066941.0, + "step": 23279 + }, + { + "epoch": 2.9614552855870753, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9702180624008179, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8796289563179016, + "num_tokens": 888099739.0, + "step": 23280 + }, + { + "epoch": 2.961582495865666, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9116443395614624, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8671372532844543, + "num_tokens": 888139675.0, + "step": 23281 + }, + { + "epoch": 2.9617097061442563, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8543715476989746, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.854123055934906, + "num_tokens": 888179374.0, + "step": 23282 + }, + { + "epoch": 2.961836916422847, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.033775568008423, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8539906740188599, + "num_tokens": 888214139.0, + "step": 23283 + }, + { + "epoch": 2.9619641267014374, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1739652156829834, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8643233776092529, + "num_tokens": 888246365.0, + "step": 23284 + }, + { + "epoch": 2.962091336980028, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8715399503707886, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8605607151985168, + "num_tokens": 888290903.0, + "step": 23285 + }, + { + "epoch": 2.9622185472586184, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9419441223144531, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.864269495010376, + "num_tokens": 888331329.0, + "step": 23286 + }, + { + "epoch": 2.962345757537209, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8512967824935913, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8590610027313232, + "num_tokens": 888372411.0, + "step": 23287 + }, + { + "epoch": 2.9624729678157995, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8855780363082886, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8656431436538696, + "num_tokens": 888409778.0, + "step": 23288 + }, + { + "epoch": 2.96260017809439, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.077742576599121, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8723630309104919, + "num_tokens": 888453593.0, + "step": 23289 + }, + { + "epoch": 2.9627273883729806, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9363118410110474, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8782167434692383, + "num_tokens": 888488245.0, + "step": 23290 + }, + { + "epoch": 2.962854598651571, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.146015167236328, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8657239079475403, + "num_tokens": 888524306.0, + "step": 23291 + }, + { + "epoch": 2.9629818089301616, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8320081233978271, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8783696889877319, + "num_tokens": 888563791.0, + "step": 23292 + }, + { + "epoch": 2.963109019208752, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8613137006759644, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8704550266265869, + "num_tokens": 888605021.0, + "step": 23293 + }, + { + "epoch": 2.9632362294873427, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8094087839126587, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8668918609619141, + "num_tokens": 888647382.0, + "step": 23294 + }, + { + "epoch": 2.963363439765933, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9148160219192505, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8655523061752319, + "num_tokens": 888682174.0, + "step": 23295 + }, + { + "epoch": 2.9634906500445237, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7213221788406372, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8677881956100464, + "num_tokens": 888724401.0, + "step": 23296 + }, + { + "epoch": 2.9636178603231143, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.781786322593689, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8729636669158936, + "num_tokens": 888766230.0, + "step": 23297 + }, + { + "epoch": 2.963745070601705, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.727765679359436, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8448162078857422, + "num_tokens": 888811865.0, + "step": 23298 + }, + { + "epoch": 2.9638722808802953, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8161910772323608, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8677375316619873, + "num_tokens": 888849549.0, + "step": 23299 + }, + { + "epoch": 2.9639994911588854, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.055079221725464, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8688169717788696, + "num_tokens": 888881305.0, + "step": 23300 + }, + { + "epoch": 2.9641267014374764, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.963354468345642, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8708293437957764, + "num_tokens": 888917862.0, + "step": 23301 + }, + { + "epoch": 2.9642539117160664, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9156402349472046, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8673550486564636, + "num_tokens": 888953883.0, + "step": 23302 + }, + { + "epoch": 2.9643811219946574, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9964485168457031, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8841761946678162, + "num_tokens": 888991052.0, + "step": 23303 + }, + { + "epoch": 2.9645083322732475, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.052166223526001, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8773999214172363, + "num_tokens": 889022523.0, + "step": 23304 + }, + { + "epoch": 2.964635542551838, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8774769306182861, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8571922779083252, + "num_tokens": 889063470.0, + "step": 23305 + }, + { + "epoch": 2.9647627528304286, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9307911396026611, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8752830624580383, + "num_tokens": 889101160.0, + "step": 23306 + }, + { + "epoch": 2.964889963109019, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.888792634010315, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8640555739402771, + "num_tokens": 889139128.0, + "step": 23307 + }, + { + "epoch": 2.9650171733876096, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9151264429092407, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8426617383956909, + "num_tokens": 889175574.0, + "step": 23308 + }, + { + "epoch": 2.9651443836662, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8777873516082764, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.86345374584198, + "num_tokens": 889216520.0, + "step": 23309 + }, + { + "epoch": 2.9652715939447907, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0264084339141846, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8780288696289062, + "num_tokens": 889256746.0, + "step": 23310 + }, + { + "epoch": 2.965398804223381, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.803449273109436, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8615691661834717, + "num_tokens": 889298868.0, + "step": 23311 + }, + { + "epoch": 2.9655260145019717, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9260812997817993, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8618067502975464, + "num_tokens": 889337758.0, + "step": 23312 + }, + { + "epoch": 2.9656532247805623, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7837070226669312, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.872591495513916, + "num_tokens": 889375308.0, + "step": 23313 + }, + { + "epoch": 2.965780435059153, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8284106254577637, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8742168545722961, + "num_tokens": 889413735.0, + "step": 23314 + }, + { + "epoch": 2.9659076453377433, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8428776264190674, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8676723837852478, + "num_tokens": 889451585.0, + "step": 23315 + }, + { + "epoch": 2.966034855616334, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9777296781539917, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8494477868080139, + "num_tokens": 889487405.0, + "step": 23316 + }, + { + "epoch": 2.9661620658949244, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.969655156135559, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8540228605270386, + "num_tokens": 889523051.0, + "step": 23317 + }, + { + "epoch": 2.966289276173515, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.018249988555908, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8476183414459229, + "num_tokens": 889559822.0, + "step": 23318 + }, + { + "epoch": 2.9664164864521054, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0572972297668457, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8640754818916321, + "num_tokens": 889591598.0, + "step": 23319 + }, + { + "epoch": 2.966543696730696, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8385071754455566, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8768681287765503, + "num_tokens": 889629787.0, + "step": 23320 + }, + { + "epoch": 2.9666709070092865, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7711833715438843, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8735754489898682, + "num_tokens": 889668730.0, + "step": 23321 + }, + { + "epoch": 2.966798117287877, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.828385591506958, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8655097484588623, + "num_tokens": 889705215.0, + "step": 23322 + }, + { + "epoch": 2.966925327566467, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.181562900543213, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8682471513748169, + "num_tokens": 889738528.0, + "step": 23323 + }, + { + "epoch": 2.967052537845058, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.1584103107452393, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8682238459587097, + "num_tokens": 889771694.0, + "step": 23324 + }, + { + "epoch": 2.967179748123648, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9852023124694824, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.879770040512085, + "num_tokens": 889806096.0, + "step": 23325 + }, + { + "epoch": 2.967306958402239, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9469417333602905, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8671025037765503, + "num_tokens": 889848514.0, + "step": 23326 + }, + { + "epoch": 2.967434168680829, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.765973448753357, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8869822025299072, + "num_tokens": 889889689.0, + "step": 23327 + }, + { + "epoch": 2.96756137895942, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9398531913757324, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8703914880752563, + "num_tokens": 889925550.0, + "step": 23328 + }, + { + "epoch": 2.9676885892380103, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.940908432006836, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8515855073928833, + "num_tokens": 889961797.0, + "step": 23329 + }, + { + "epoch": 2.967815799516601, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.001295804977417, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8720548748970032, + "num_tokens": 889997935.0, + "step": 23330 + }, + { + "epoch": 2.9679430097951913, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.3574419021606445, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8775841593742371, + "num_tokens": 890037900.0, + "step": 23331 + }, + { + "epoch": 2.968070220073782, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9387060403823853, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.873225748538971, + "num_tokens": 890074814.0, + "step": 23332 + }, + { + "epoch": 2.9681974303523724, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9028862714767456, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8798894882202148, + "num_tokens": 890107091.0, + "step": 23333 + }, + { + "epoch": 2.968324640630963, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.942640781402588, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8753951787948608, + "num_tokens": 890140558.0, + "step": 23334 + }, + { + "epoch": 2.9684518509095534, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.004887342453003, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8660462498664856, + "num_tokens": 890174563.0, + "step": 23335 + }, + { + "epoch": 2.968579061188144, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9231107234954834, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8640832901000977, + "num_tokens": 890210584.0, + "step": 23336 + }, + { + "epoch": 2.9687062714667345, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.808761715888977, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8805405497550964, + "num_tokens": 890251955.0, + "step": 23337 + }, + { + "epoch": 2.968833481745325, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9053552150726318, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8622080087661743, + "num_tokens": 890289095.0, + "step": 23338 + }, + { + "epoch": 2.9689606920239155, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8020861148834229, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.868388295173645, + "num_tokens": 890327864.0, + "step": 23339 + }, + { + "epoch": 2.969087902302506, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.92044198513031, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8663802146911621, + "num_tokens": 890364244.0, + "step": 23340 + }, + { + "epoch": 2.9692151125810966, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0657126903533936, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8533123731613159, + "num_tokens": 890397652.0, + "step": 23341 + }, + { + "epoch": 2.969342322859687, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.836336374282837, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8718242049217224, + "num_tokens": 890435598.0, + "step": 23342 + }, + { + "epoch": 2.9694695331382777, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.878117561340332, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8703781962394714, + "num_tokens": 890473445.0, + "step": 23343 + }, + { + "epoch": 2.969596743416868, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.761759877204895, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8705339431762695, + "num_tokens": 890514263.0, + "step": 23344 + }, + { + "epoch": 2.9697239536954587, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.809260368347168, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8717545866966248, + "num_tokens": 890554123.0, + "step": 23345 + }, + { + "epoch": 2.9698511639740492, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.781317114830017, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.866302490234375, + "num_tokens": 890593938.0, + "step": 23346 + }, + { + "epoch": 2.9699783742526398, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8767942190170288, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.858894944190979, + "num_tokens": 890632260.0, + "step": 23347 + }, + { + "epoch": 2.97010558453123, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2306578159332275, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8633654713630676, + "num_tokens": 890666852.0, + "step": 23348 + }, + { + "epoch": 2.970232794809821, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7701290845870972, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8623666763305664, + "num_tokens": 890707959.0, + "step": 23349 + }, + { + "epoch": 2.970360005088411, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8982431888580322, + "learning_rate": 1e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8453372716903687, + "num_tokens": 890748729.0, + "step": 23350 + }, + { + "epoch": 2.970487215367002, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.3298521041870117, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8798268437385559, + "num_tokens": 890782116.0, + "step": 23351 + }, + { + "epoch": 2.970614425645592, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.871273159980774, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8547117710113525, + "num_tokens": 890827309.0, + "step": 23352 + }, + { + "epoch": 2.970741635924183, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0692996978759766, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8568206429481506, + "num_tokens": 890863879.0, + "step": 23353 + }, + { + "epoch": 2.970868846202773, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 3.2819876670837402, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8694633841514587, + "num_tokens": 890901184.0, + "step": 23354 + }, + { + "epoch": 2.9709960564813636, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7952122688293457, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8752725124359131, + "num_tokens": 890937779.0, + "step": 23355 + }, + { + "epoch": 2.971123266759954, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0295934677124023, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.879866898059845, + "num_tokens": 890978055.0, + "step": 23356 + }, + { + "epoch": 2.9712504770385446, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.165179491043091, + "learning_rate": 1e-06, + "loss": 0.5634, + "mean_token_accuracy": 0.8338226079940796, + "num_tokens": 891010985.0, + "step": 23357 + }, + { + "epoch": 2.971377687317135, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9953604936599731, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8727185130119324, + "num_tokens": 891045904.0, + "step": 23358 + }, + { + "epoch": 2.9715048975957257, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7684944868087769, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8710163831710815, + "num_tokens": 891086240.0, + "step": 23359 + }, + { + "epoch": 2.971632107874316, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.932895302772522, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.863178551197052, + "num_tokens": 891126482.0, + "step": 23360 + }, + { + "epoch": 2.9717593181529067, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.860569953918457, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.855387270450592, + "num_tokens": 891165982.0, + "step": 23361 + }, + { + "epoch": 2.9718865284314973, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8525649309158325, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8512710928916931, + "num_tokens": 891199101.0, + "step": 23362 + }, + { + "epoch": 2.972013738710088, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8928824663162231, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.865996241569519, + "num_tokens": 891231511.0, + "step": 23363 + }, + { + "epoch": 2.9721409489886783, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0847392082214355, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8545805215835571, + "num_tokens": 891262369.0, + "step": 23364 + }, + { + "epoch": 2.972268159267269, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9809823036193848, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8515998125076294, + "num_tokens": 891304206.0, + "step": 23365 + }, + { + "epoch": 2.9723953695458594, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8701868057250977, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8539819717407227, + "num_tokens": 891348006.0, + "step": 23366 + }, + { + "epoch": 2.97252257982445, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8182971477508545, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8739385604858398, + "num_tokens": 891386851.0, + "step": 23367 + }, + { + "epoch": 2.9726497901030404, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7548145055770874, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8764196634292603, + "num_tokens": 891423582.0, + "step": 23368 + }, + { + "epoch": 2.972777000381631, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6751103401184082, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8821606040000916, + "num_tokens": 891463773.0, + "step": 23369 + }, + { + "epoch": 2.9729042106602215, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9532150030136108, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8768570423126221, + "num_tokens": 891498117.0, + "step": 23370 + }, + { + "epoch": 2.973031420938812, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9592431783676147, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8750683069229126, + "num_tokens": 891529055.0, + "step": 23371 + }, + { + "epoch": 2.9731586312174025, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7897733449935913, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8756259679794312, + "num_tokens": 891567370.0, + "step": 23372 + }, + { + "epoch": 2.9732858414959926, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.00541353225708, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.870451807975769, + "num_tokens": 891602737.0, + "step": 23373 + }, + { + "epoch": 2.9734130517745836, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.92384672164917, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8604034781455994, + "num_tokens": 891639713.0, + "step": 23374 + }, + { + "epoch": 2.9735402620531737, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9054275751113892, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8791046142578125, + "num_tokens": 891673931.0, + "step": 23375 + }, + { + "epoch": 2.9736674723317646, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.052764892578125, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8644962906837463, + "num_tokens": 891708015.0, + "step": 23376 + }, + { + "epoch": 2.9737946826103547, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8834444284439087, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.863330066204071, + "num_tokens": 891746023.0, + "step": 23377 + }, + { + "epoch": 2.9739218928889453, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.01741099357605, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8639745712280273, + "num_tokens": 891777767.0, + "step": 23378 + }, + { + "epoch": 2.974049103167536, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0250067710876465, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8689966201782227, + "num_tokens": 891810816.0, + "step": 23379 + }, + { + "epoch": 2.9741763134461263, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.860343098640442, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8593919277191162, + "num_tokens": 891848810.0, + "step": 23380 + }, + { + "epoch": 2.974303523724717, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8234344720840454, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8706498146057129, + "num_tokens": 891892553.0, + "step": 23381 + }, + { + "epoch": 2.9744307340033074, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8436201810836792, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.877967357635498, + "num_tokens": 891927539.0, + "step": 23382 + }, + { + "epoch": 2.974557944281898, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.941184401512146, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8699604868888855, + "num_tokens": 891959577.0, + "step": 23383 + }, + { + "epoch": 2.9746851545604884, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9291799068450928, + "learning_rate": 1e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8396408557891846, + "num_tokens": 892000799.0, + "step": 23384 + }, + { + "epoch": 2.974812364839079, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9508417844772339, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8589714765548706, + "num_tokens": 892042200.0, + "step": 23385 + }, + { + "epoch": 2.9749395751176695, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9828925132751465, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8560907244682312, + "num_tokens": 892084838.0, + "step": 23386 + }, + { + "epoch": 2.97506678539626, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.846828818321228, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8502404689788818, + "num_tokens": 892126442.0, + "step": 23387 + }, + { + "epoch": 2.9751939956748505, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9097399711608887, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8539861440658569, + "num_tokens": 892165019.0, + "step": 23388 + }, + { + "epoch": 2.975321205953441, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8922799825668335, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8696479797363281, + "num_tokens": 892202291.0, + "step": 23389 + }, + { + "epoch": 2.9754484162320316, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.023521900177002, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8561283349990845, + "num_tokens": 892235159.0, + "step": 23390 + }, + { + "epoch": 2.975575626510622, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9700254201889038, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.853615939617157, + "num_tokens": 892271967.0, + "step": 23391 + }, + { + "epoch": 2.9757028367892127, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9503260850906372, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8516780138015747, + "num_tokens": 892306230.0, + "step": 23392 + }, + { + "epoch": 2.975830047067803, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8778903484344482, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8677933216094971, + "num_tokens": 892342118.0, + "step": 23393 + }, + { + "epoch": 2.9759572573463937, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7737098932266235, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.87112957239151, + "num_tokens": 892380764.0, + "step": 23394 + }, + { + "epoch": 2.9760844676249842, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8815490007400513, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8630704879760742, + "num_tokens": 892421113.0, + "step": 23395 + }, + { + "epoch": 2.9762116779035748, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9144221544265747, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8741269707679749, + "num_tokens": 892457512.0, + "step": 23396 + }, + { + "epoch": 2.9763388881821653, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8980121612548828, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8710720539093018, + "num_tokens": 892493599.0, + "step": 23397 + }, + { + "epoch": 2.9764660984607554, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9157657623291016, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8560062646865845, + "num_tokens": 892528474.0, + "step": 23398 + }, + { + "epoch": 2.9765933087393464, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7726696729660034, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8737090229988098, + "num_tokens": 892569591.0, + "step": 23399 + }, + { + "epoch": 2.9767205190179364, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9449726343154907, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8711493611335754, + "num_tokens": 892607185.0, + "step": 23400 + }, + { + "epoch": 2.9768477292965274, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8735944032669067, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8687077760696411, + "num_tokens": 892641453.0, + "step": 23401 + }, + { + "epoch": 2.9769749395751175, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8635191917419434, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8606212735176086, + "num_tokens": 892676802.0, + "step": 23402 + }, + { + "epoch": 2.977102149853708, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 3.004305601119995, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8781042098999023, + "num_tokens": 892711694.0, + "step": 23403 + }, + { + "epoch": 2.9772293601322986, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9022570848464966, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8574540615081787, + "num_tokens": 892752662.0, + "step": 23404 + }, + { + "epoch": 2.977356570410889, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7973885536193848, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8661358952522278, + "num_tokens": 892790170.0, + "step": 23405 + }, + { + "epoch": 2.9774837806894796, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.879921317100525, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8502483367919922, + "num_tokens": 892831437.0, + "step": 23406 + }, + { + "epoch": 2.97761099096807, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.8196942806243896, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8877645134925842, + "num_tokens": 892869022.0, + "step": 23407 + }, + { + "epoch": 2.9777382012466607, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7817274332046509, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8674120306968689, + "num_tokens": 892910738.0, + "step": 23408 + }, + { + "epoch": 2.977865411525251, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.816056728363037, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8746612668037415, + "num_tokens": 892952055.0, + "step": 23409 + }, + { + "epoch": 2.9779926218038417, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.999957799911499, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8579369783401489, + "num_tokens": 892985843.0, + "step": 23410 + }, + { + "epoch": 2.9781198320824323, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7380739450454712, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8556718230247498, + "num_tokens": 893031197.0, + "step": 23411 + }, + { + "epoch": 2.978247042361023, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0155293941497803, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8809322118759155, + "num_tokens": 893068113.0, + "step": 23412 + }, + { + "epoch": 2.9783742526396133, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9138455390930176, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8781554698944092, + "num_tokens": 893109148.0, + "step": 23413 + }, + { + "epoch": 2.978501462918204, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6816352605819702, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8719058036804199, + "num_tokens": 893151406.0, + "step": 23414 + }, + { + "epoch": 2.9786286731967944, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.807126760482788, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8708977699279785, + "num_tokens": 893190209.0, + "step": 23415 + }, + { + "epoch": 2.978755883475385, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8581117391586304, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8662789463996887, + "num_tokens": 893232949.0, + "step": 23416 + }, + { + "epoch": 2.9788830937539754, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7349077463150024, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8663650155067444, + "num_tokens": 893273931.0, + "step": 23417 + }, + { + "epoch": 2.979010304032566, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7893290519714355, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8463056087493896, + "num_tokens": 893317465.0, + "step": 23418 + }, + { + "epoch": 2.9791375143111565, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.951431155204773, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.849939227104187, + "num_tokens": 893354089.0, + "step": 23419 + }, + { + "epoch": 2.979264724589747, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9879893064498901, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8623970150947571, + "num_tokens": 893388051.0, + "step": 23420 + }, + { + "epoch": 2.979391934868337, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9419090747833252, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8610796928405762, + "num_tokens": 893426191.0, + "step": 23421 + }, + { + "epoch": 2.979519145146928, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0293781757354736, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8576267957687378, + "num_tokens": 893462134.0, + "step": 23422 + }, + { + "epoch": 2.979646355425518, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8257687091827393, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8602522611618042, + "num_tokens": 893503159.0, + "step": 23423 + }, + { + "epoch": 2.979773565704109, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1069495677948, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8477467894554138, + "num_tokens": 893538379.0, + "step": 23424 + }, + { + "epoch": 2.979900775982699, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9320578575134277, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.854430079460144, + "num_tokens": 893580431.0, + "step": 23425 + }, + { + "epoch": 2.98002798626129, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.0454139709472656, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8597617745399475, + "num_tokens": 893622412.0, + "step": 23426 + }, + { + "epoch": 2.9801551965398803, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7935818433761597, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8693988919258118, + "num_tokens": 893658575.0, + "step": 23427 + }, + { + "epoch": 2.980282406818471, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7127760648727417, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8731387257575989, + "num_tokens": 893702812.0, + "step": 23428 + }, + { + "epoch": 2.9804096170970613, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.796536922454834, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8582336902618408, + "num_tokens": 893742785.0, + "step": 23429 + }, + { + "epoch": 2.980536827375652, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7460583448410034, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8718634843826294, + "num_tokens": 893782310.0, + "step": 23430 + }, + { + "epoch": 2.9806640376542424, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.833862543106079, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8829740285873413, + "num_tokens": 893822794.0, + "step": 23431 + }, + { + "epoch": 2.980791247932833, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7184386253356934, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8723131418228149, + "num_tokens": 893870462.0, + "step": 23432 + }, + { + "epoch": 2.9809184582114234, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.826770305633545, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8659727573394775, + "num_tokens": 893910654.0, + "step": 23433 + }, + { + "epoch": 2.981045668490014, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9140782356262207, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.86304771900177, + "num_tokens": 893948819.0, + "step": 23434 + }, + { + "epoch": 2.9811728787686045, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.865738868713379, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8721596002578735, + "num_tokens": 893983325.0, + "step": 23435 + }, + { + "epoch": 2.981300089047195, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9559016227722168, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8644236326217651, + "num_tokens": 894023712.0, + "step": 23436 + }, + { + "epoch": 2.9814272993257855, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8836172819137573, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8757642507553101, + "num_tokens": 894059719.0, + "step": 23437 + }, + { + "epoch": 2.981554509604376, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7953004837036133, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8661256432533264, + "num_tokens": 894100150.0, + "step": 23438 + }, + { + "epoch": 2.9816817198829666, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0935373306274414, + "learning_rate": 1e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.8444516658782959, + "num_tokens": 894134336.0, + "step": 23439 + }, + { + "epoch": 2.981808930161557, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.126420021057129, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8514314889907837, + "num_tokens": 894168972.0, + "step": 23440 + }, + { + "epoch": 2.9819361404401477, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0955073833465576, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.862196683883667, + "num_tokens": 894204641.0, + "step": 23441 + }, + { + "epoch": 2.982063350718738, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.013354778289795, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.867741584777832, + "num_tokens": 894248403.0, + "step": 23442 + }, + { + "epoch": 2.9821905609973287, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.862534999847412, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8824299573898315, + "num_tokens": 894284035.0, + "step": 23443 + }, + { + "epoch": 2.9823177712759192, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.85289466381073, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8701339364051819, + "num_tokens": 894321804.0, + "step": 23444 + }, + { + "epoch": 2.9824449815545098, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.9292762279510498, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8749176859855652, + "num_tokens": 894355648.0, + "step": 23445 + }, + { + "epoch": 2.9825721918331, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8664618730545044, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8660968542098999, + "num_tokens": 894392947.0, + "step": 23446 + }, + { + "epoch": 2.982699402111691, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0681493282318115, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8562929630279541, + "num_tokens": 894433437.0, + "step": 23447 + }, + { + "epoch": 2.982826612390281, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.821376919746399, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8710547685623169, + "num_tokens": 894481428.0, + "step": 23448 + }, + { + "epoch": 2.982953822668872, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9088040590286255, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8663293719291687, + "num_tokens": 894518055.0, + "step": 23449 + }, + { + "epoch": 2.983081032947462, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7758774757385254, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8605425953865051, + "num_tokens": 894562693.0, + "step": 23450 + }, + { + "epoch": 2.983208243226053, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.821136713027954, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8645416498184204, + "num_tokens": 894604973.0, + "step": 23451 + }, + { + "epoch": 2.983335453504643, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 2.176028251647949, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8724592924118042, + "num_tokens": 894645672.0, + "step": 23452 + }, + { + "epoch": 2.9834626637832335, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.043368101119995, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8633431196212769, + "num_tokens": 894686702.0, + "step": 23453 + }, + { + "epoch": 2.983589874061824, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8487329483032227, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8737856149673462, + "num_tokens": 894722768.0, + "step": 23454 + }, + { + "epoch": 2.9837170843404146, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.846848726272583, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8786465525627136, + "num_tokens": 894759216.0, + "step": 23455 + }, + { + "epoch": 2.983844294619005, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8169496059417725, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8710924983024597, + "num_tokens": 894799691.0, + "step": 23456 + }, + { + "epoch": 2.9839715048975957, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.806025505065918, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8621945381164551, + "num_tokens": 894835869.0, + "step": 23457 + }, + { + "epoch": 2.984098715176186, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9041084051132202, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8439987897872925, + "num_tokens": 894875706.0, + "step": 23458 + }, + { + "epoch": 2.9842259254547767, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7831329107284546, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8722378015518188, + "num_tokens": 894920135.0, + "step": 23459 + }, + { + "epoch": 2.9843531357333672, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0327179431915283, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8686655163764954, + "num_tokens": 894960825.0, + "step": 23460 + }, + { + "epoch": 2.9844803460119578, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8649412393569946, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8676372766494751, + "num_tokens": 895001935.0, + "step": 23461 + }, + { + "epoch": 2.9846075562905483, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.899113655090332, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8758697509765625, + "num_tokens": 895033559.0, + "step": 23462 + }, + { + "epoch": 2.984734766569139, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8561770915985107, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8660629987716675, + "num_tokens": 895070958.0, + "step": 23463 + }, + { + "epoch": 2.9848619768477294, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.947174310684204, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8652870655059814, + "num_tokens": 895111533.0, + "step": 23464 + }, + { + "epoch": 2.98498918712632, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0084338188171387, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8630766272544861, + "num_tokens": 895149861.0, + "step": 23465 + }, + { + "epoch": 2.9851163974049104, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.803553819656372, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8730886578559875, + "num_tokens": 895192958.0, + "step": 23466 + }, + { + "epoch": 2.985243607683501, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.141413927078247, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8773844242095947, + "num_tokens": 895226710.0, + "step": 23467 + }, + { + "epoch": 2.9853708179620915, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.909395694732666, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8692673444747925, + "num_tokens": 895267695.0, + "step": 23468 + }, + { + "epoch": 2.985498028240682, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7976938486099243, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8813385367393494, + "num_tokens": 895303101.0, + "step": 23469 + }, + { + "epoch": 2.9856252385192725, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9423017501831055, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8717811107635498, + "num_tokens": 895340437.0, + "step": 23470 + }, + { + "epoch": 2.9857524487978626, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9486265182495117, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8817862868309021, + "num_tokens": 895377695.0, + "step": 23471 + }, + { + "epoch": 2.9858796590764536, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9637912511825562, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8575085401535034, + "num_tokens": 895418599.0, + "step": 23472 + }, + { + "epoch": 2.9860068693550437, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8447943925857544, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8667392730712891, + "num_tokens": 895454832.0, + "step": 23473 + }, + { + "epoch": 2.9861340796336346, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9861345291137695, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8485479354858398, + "num_tokens": 895490607.0, + "step": 23474 + }, + { + "epoch": 2.9862612899122247, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8255794048309326, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8611947298049927, + "num_tokens": 895531340.0, + "step": 23475 + }, + { + "epoch": 2.9863885001908153, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9951903820037842, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8669347167015076, + "num_tokens": 895568688.0, + "step": 23476 + }, + { + "epoch": 2.986515710469406, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8838305473327637, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8560873866081238, + "num_tokens": 895607316.0, + "step": 23477 + }, + { + "epoch": 2.9866429207479963, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7598707675933838, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8833922147750854, + "num_tokens": 895648694.0, + "step": 23478 + }, + { + "epoch": 2.986770131026587, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9166233539581299, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8614908456802368, + "num_tokens": 895687996.0, + "step": 23479 + }, + { + "epoch": 2.9868973413051774, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.795242428779602, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8709279298782349, + "num_tokens": 895726727.0, + "step": 23480 + }, + { + "epoch": 2.987024551583768, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7850141525268555, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.871425986289978, + "num_tokens": 895760688.0, + "step": 23481 + }, + { + "epoch": 2.9871517618623584, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.051424980163574, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8464449644088745, + "num_tokens": 895797303.0, + "step": 23482 + }, + { + "epoch": 2.987278972140949, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9859198331832886, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8718293905258179, + "num_tokens": 895834078.0, + "step": 23483 + }, + { + "epoch": 2.9874061824195395, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9068013429641724, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8553606867790222, + "num_tokens": 895872640.0, + "step": 23484 + }, + { + "epoch": 2.98753339269813, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.0810697078704834, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8809019327163696, + "num_tokens": 895907646.0, + "step": 23485 + }, + { + "epoch": 2.9876606029767205, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.78917396068573, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.875796377658844, + "num_tokens": 895944183.0, + "step": 23486 + }, + { + "epoch": 2.987787813255311, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8824635744094849, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8679698705673218, + "num_tokens": 895983987.0, + "step": 23487 + }, + { + "epoch": 2.9879150235339016, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9477471113204956, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8551697731018066, + "num_tokens": 896020928.0, + "step": 23488 + }, + { + "epoch": 2.988042233812492, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8755888938903809, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8729361295700073, + "num_tokens": 896057459.0, + "step": 23489 + }, + { + "epoch": 2.9881694440910826, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.147091865539551, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8622551560401917, + "num_tokens": 896095049.0, + "step": 23490 + }, + { + "epoch": 2.988296654369673, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.4443776607513428, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.877379298210144, + "num_tokens": 896127433.0, + "step": 23491 + }, + { + "epoch": 2.9884238646482637, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8573219776153564, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8725932240486145, + "num_tokens": 896165662.0, + "step": 23492 + }, + { + "epoch": 2.9885510749268542, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7956187725067139, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8629055023193359, + "num_tokens": 896207753.0, + "step": 23493 + }, + { + "epoch": 2.9886782852054448, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7966290712356567, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8640590906143188, + "num_tokens": 896249105.0, + "step": 23494 + }, + { + "epoch": 2.9888054954840353, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.6369407176971436, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8665515184402466, + "num_tokens": 896291728.0, + "step": 23495 + }, + { + "epoch": 2.9889327057626254, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.081151008605957, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8563281297683716, + "num_tokens": 896331729.0, + "step": 23496 + }, + { + "epoch": 2.9890599160412163, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8130751848220825, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8563394546508789, + "num_tokens": 896371804.0, + "step": 23497 + }, + { + "epoch": 2.9891871263198064, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9889904260635376, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8632389307022095, + "num_tokens": 896408398.0, + "step": 23498 + }, + { + "epoch": 2.9893143365983974, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9542969465255737, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8819657564163208, + "num_tokens": 896440496.0, + "step": 23499 + }, + { + "epoch": 2.9894415468769875, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0189170837402344, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8683409690856934, + "num_tokens": 896475212.0, + "step": 23500 + }, + { + "epoch": 2.989568757155578, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8269882202148438, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8629719018936157, + "num_tokens": 896517122.0, + "step": 23501 + }, + { + "epoch": 2.9896959674341685, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1605618000030518, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8723007440567017, + "num_tokens": 896557259.0, + "step": 23502 + }, + { + "epoch": 2.989823177712759, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.7821242809295654, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8563110828399658, + "num_tokens": 896600237.0, + "step": 23503 + }, + { + "epoch": 2.9899503879913496, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9879969358444214, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8786317110061646, + "num_tokens": 896640564.0, + "step": 23504 + }, + { + "epoch": 2.99007759826994, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.145127058029175, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8772213459014893, + "num_tokens": 896674164.0, + "step": 23505 + }, + { + "epoch": 2.9902048085485307, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.8547513484954834, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8601142764091492, + "num_tokens": 896716613.0, + "step": 23506 + }, + { + "epoch": 2.990332018827121, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8613831996917725, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8556393384933472, + "num_tokens": 896755586.0, + "step": 23507 + }, + { + "epoch": 2.9904592291057117, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.809304118156433, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8639435768127441, + "num_tokens": 896797641.0, + "step": 23508 + }, + { + "epoch": 2.9905864393843022, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9981964826583862, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.862180233001709, + "num_tokens": 896835113.0, + "step": 23509 + }, + { + "epoch": 2.9907136496628928, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9930375814437866, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8653768301010132, + "num_tokens": 896866496.0, + "step": 23510 + }, + { + "epoch": 2.9908408599414833, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9281724691390991, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8634859919548035, + "num_tokens": 896907535.0, + "step": 23511 + }, + { + "epoch": 2.990968070220074, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.813043475151062, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8833889365196228, + "num_tokens": 896948574.0, + "step": 23512 + }, + { + "epoch": 2.9910952804986644, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9529509544372559, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8612722754478455, + "num_tokens": 896992377.0, + "step": 23513 + }, + { + "epoch": 2.991222490777255, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8833978176116943, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8617318272590637, + "num_tokens": 897035411.0, + "step": 23514 + }, + { + "epoch": 2.9913497010558454, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9266839027404785, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8689431548118591, + "num_tokens": 897074661.0, + "step": 23515 + }, + { + "epoch": 2.991476911334436, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7746851444244385, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8685894012451172, + "num_tokens": 897115550.0, + "step": 23516 + }, + { + "epoch": 2.9916041216130265, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8527250289916992, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8594170808792114, + "num_tokens": 897155091.0, + "step": 23517 + }, + { + "epoch": 2.991731331891617, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.022722005844116, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8623373508453369, + "num_tokens": 897196551.0, + "step": 23518 + }, + { + "epoch": 2.991858542170207, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7882384061813354, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8762314915657043, + "num_tokens": 897231885.0, + "step": 23519 + }, + { + "epoch": 2.991985752448798, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.091947317123413, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.865323007106781, + "num_tokens": 897267944.0, + "step": 23520 + }, + { + "epoch": 2.992112962727388, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.785064935684204, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8690125346183777, + "num_tokens": 897307330.0, + "step": 23521 + }, + { + "epoch": 2.992240173005979, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2008488178253174, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8671966195106506, + "num_tokens": 897344028.0, + "step": 23522 + }, + { + "epoch": 2.992367383284569, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.809234380722046, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8590731620788574, + "num_tokens": 897391213.0, + "step": 23523 + }, + { + "epoch": 2.99249459356316, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9200359582901, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8609800338745117, + "num_tokens": 897428222.0, + "step": 23524 + }, + { + "epoch": 2.9926218038417502, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0337839126586914, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8535434007644653, + "num_tokens": 897457737.0, + "step": 23525 + }, + { + "epoch": 2.992749014120341, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8291208744049072, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8754268884658813, + "num_tokens": 897498925.0, + "step": 23526 + }, + { + "epoch": 2.9928762243989313, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0746772289276123, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8603070378303528, + "num_tokens": 897533157.0, + "step": 23527 + }, + { + "epoch": 2.993003434677522, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8404383659362793, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8685396909713745, + "num_tokens": 897574159.0, + "step": 23528 + }, + { + "epoch": 2.9931306449561124, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.979343056678772, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.857664942741394, + "num_tokens": 897606970.0, + "step": 23529 + }, + { + "epoch": 2.993257855234703, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.6305553913116455, + "learning_rate": 1e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.841066300868988, + "num_tokens": 897647725.0, + "step": 23530 + }, + { + "epoch": 2.9933850655132934, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.880948543548584, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8686923384666443, + "num_tokens": 897687109.0, + "step": 23531 + }, + { + "epoch": 2.993512275791884, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9905505180358887, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8771676421165466, + "num_tokens": 897720994.0, + "step": 23532 + }, + { + "epoch": 2.9936394860704745, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.119680881500244, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.882085382938385, + "num_tokens": 897749855.0, + "step": 23533 + }, + { + "epoch": 2.993766696349065, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9641056060791016, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8630668520927429, + "num_tokens": 897792846.0, + "step": 23534 + }, + { + "epoch": 2.9938939066276555, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7498270273208618, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8682070970535278, + "num_tokens": 897836765.0, + "step": 23535 + }, + { + "epoch": 2.994021116906246, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7743611335754395, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8673742413520813, + "num_tokens": 897880918.0, + "step": 23536 + }, + { + "epoch": 2.9941483271848366, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.05202054977417, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8630395531654358, + "num_tokens": 897916676.0, + "step": 23537 + }, + { + "epoch": 2.994275537463427, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0040342807769775, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8716630935668945, + "num_tokens": 897953968.0, + "step": 23538 + }, + { + "epoch": 2.9944027477420176, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9776643514633179, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8667250871658325, + "num_tokens": 897993055.0, + "step": 23539 + }, + { + "epoch": 2.994529958020608, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9462960958480835, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8740607500076294, + "num_tokens": 898030202.0, + "step": 23540 + }, + { + "epoch": 2.9946571682991987, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.9037375450134277, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8615239262580872, + "num_tokens": 898067234.0, + "step": 23541 + }, + { + "epoch": 2.9947843785777892, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9289178848266602, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8698733448982239, + "num_tokens": 898105423.0, + "step": 23542 + }, + { + "epoch": 2.9949115888563798, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9228726625442505, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8854238390922546, + "num_tokens": 898144550.0, + "step": 23543 + }, + { + "epoch": 2.99503879913497, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0260555744171143, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8690835237503052, + "num_tokens": 898179277.0, + "step": 23544 + }, + { + "epoch": 2.995166009413561, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.893802523612976, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8659429550170898, + "num_tokens": 898218518.0, + "step": 23545 + }, + { + "epoch": 2.995293219692151, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.060148239135742, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8546615242958069, + "num_tokens": 898259266.0, + "step": 23546 + }, + { + "epoch": 2.995420429970742, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0686709880828857, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8639734983444214, + "num_tokens": 898292648.0, + "step": 23547 + }, + { + "epoch": 2.995547640249332, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7576955556869507, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8692665100097656, + "num_tokens": 898328413.0, + "step": 23548 + }, + { + "epoch": 2.995674850527923, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.582348346710205, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8689209222793579, + "num_tokens": 898376963.0, + "step": 23549 + }, + { + "epoch": 2.995802060806513, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9399875402450562, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.878013014793396, + "num_tokens": 898407256.0, + "step": 23550 + }, + { + "epoch": 2.9959292710851035, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7925769090652466, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8696931600570679, + "num_tokens": 898448800.0, + "step": 23551 + }, + { + "epoch": 2.996056481363694, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8278743028640747, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8626005053520203, + "num_tokens": 898488912.0, + "step": 23552 + }, + { + "epoch": 2.9961836916422846, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.750741958618164, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8607347011566162, + "num_tokens": 898530434.0, + "step": 23553 + }, + { + "epoch": 2.996310901920875, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9517278671264648, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8671396374702454, + "num_tokens": 898565044.0, + "step": 23554 + }, + { + "epoch": 2.9964381121994657, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9558216333389282, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8764932155609131, + "num_tokens": 898601798.0, + "step": 23555 + }, + { + "epoch": 2.996565322478056, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8730732202529907, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8568205833435059, + "num_tokens": 898644789.0, + "step": 23556 + }, + { + "epoch": 2.9966925327566467, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.827690839767456, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8662526607513428, + "num_tokens": 898683928.0, + "step": 23557 + }, + { + "epoch": 2.9968197430352372, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9209328889846802, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8606984615325928, + "num_tokens": 898726217.0, + "step": 23558 + }, + { + "epoch": 2.9969469533138278, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.977642297744751, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8752838969230652, + "num_tokens": 898760575.0, + "step": 23559 + }, + { + "epoch": 2.9970741635924183, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8485037088394165, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8633445501327515, + "num_tokens": 898801232.0, + "step": 23560 + }, + { + "epoch": 2.997201373871009, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7940013408660889, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8720206022262573, + "num_tokens": 898837174.0, + "step": 23561 + }, + { + "epoch": 2.9973285841495994, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8201621770858765, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8575534224510193, + "num_tokens": 898875849.0, + "step": 23562 + }, + { + "epoch": 2.99745579442819, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8816639184951782, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8759539127349854, + "num_tokens": 898909975.0, + "step": 23563 + }, + { + "epoch": 2.9975830047067804, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2198004722595215, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8577395677566528, + "num_tokens": 898948250.0, + "step": 23564 + }, + { + "epoch": 2.997710214985371, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 80.52141571044922, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8835996985435486, + "num_tokens": 898989122.0, + "step": 23565 + }, + { + "epoch": 2.9978374252639615, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.126361131668091, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8617415428161621, + "num_tokens": 899028715.0, + "step": 23566 + }, + { + "epoch": 2.997964635542552, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.058117151260376, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8691877126693726, + "num_tokens": 899064101.0, + "step": 23567 + }, + { + "epoch": 2.9980918458211425, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9326623678207397, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8707823157310486, + "num_tokens": 899101759.0, + "step": 23568 + }, + { + "epoch": 2.9982190560997326, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.807619333267212, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8596301674842834, + "num_tokens": 899142273.0, + "step": 23569 + }, + { + "epoch": 2.9983462663783236, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.972607135772705, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8679362535476685, + "num_tokens": 899175878.0, + "step": 23570 + }, + { + "epoch": 2.9984734766569137, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8696428537368774, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8776094913482666, + "num_tokens": 899208937.0, + "step": 23571 + }, + { + "epoch": 2.9986006869355046, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9051344394683838, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8662265539169312, + "num_tokens": 899244088.0, + "step": 23572 + }, + { + "epoch": 2.9987278972140947, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9760239124298096, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8623069524765015, + "num_tokens": 899280111.0, + "step": 23573 + }, + { + "epoch": 2.9988551074926852, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.160475492477417, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8656119108200073, + "num_tokens": 899310882.0, + "step": 23574 + }, + { + "epoch": 2.9989823177712758, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.837573766708374, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8532054424285889, + "num_tokens": 899346820.0, + "step": 23575 + }, + { + "epoch": 2.9991095280498663, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7812751531600952, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8796887397766113, + "num_tokens": 899382044.0, + "step": 23576 + }, + { + "epoch": 2.999236738328457, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.9715368747711182, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8671000003814697, + "num_tokens": 899418389.0, + "step": 23577 + }, + { + "epoch": 2.9993639486070474, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7766549587249756, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8645281791687012, + "num_tokens": 899458486.0, + "step": 23578 + }, + { + "epoch": 2.999491158885638, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8337938785552979, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8650719523429871, + "num_tokens": 899498168.0, + "step": 23579 + }, + { + "epoch": 2.9996183691642284, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.754211187362671, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8825895190238953, + "num_tokens": 899538655.0, + "step": 23580 + }, + { + "epoch": 2.999745579442819, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.925632357597351, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8553794026374817, + "num_tokens": 899581693.0, + "step": 23581 + }, + { + "epoch": 2.9998727897214095, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.8010005950927734, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8645663261413574, + "num_tokens": 899623817.0, + "step": 23582 + }, + { + "epoch": 3.0, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.0259485244750977, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8708186149597168, + "num_tokens": 899664226.0, + "step": 23583 + }, + { + "epoch": 3.0, + "ewc_loss": 8.285045623779297e-06, + "step": 23583, + "total_flos": 4.051147348618982e+19, + "train_loss": 0.44752783132196916, + "train_runtime": 39436.9749, + "train_samples_per_second": 9.567, + "train_steps_per_second": 0.598 + } + ], + "logging_steps": 1, + "max_steps": 23583, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 11792, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.051147348618982e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..d38df53 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cbc31391cd2cf86e93ae3203510d2a29baf921a27d5d2002934cd4774d9e54b +size 13393